from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import warnings
warnings.filterwarnings("ignore", message=".*_pytree_node.*")
import uvicorn
import os
import tempfile
import aiofiles
from datetime import datetime
import traceback
import logging
from typing import List, Optional
import time
from fastapi.responses import JSONResponse

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(title="Material Summarizer API")

from dotenv import load_dotenv
load_dotenv()

# Get URLs from environment
FRONTEND_URL = os.getenv('FRONTEND_URL')
BACKEND_URL = os.getenv('BACKEND_URL', 'http://localhost:5000')

# CORS middleware (skip unset origins so a missing FRONTEND_URL doesn't add None)
app.add_middleware(
    CORSMiddleware,
    allow_origins=[url for url in (FRONTEND_URL, BACKEND_URL) if url],  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
    max_age=600,
)

# Import processing functions
try:
    from document_parser import parse_document
    from summarizer import summarize_text
    from utils import chunked_summarize
    DEPENDENCIES_LOADED = True
    logger.info("All AI dependencies loaded successfully")
except ImportError as e:
    logger.error(f"Import error: {e}")
    DEPENDENCIES_LOADED = False

@app.on_event("startup")
async def startup_event():
    """Handle startup events"""
    logger.info("Application startup initiated")
    # Load model on startup to avoid cold start delays
    try:
        from summarizer import get_summarizer
        get_summarizer()  # Pre-load the model
        logger.info("Models pre-loaded successfully")
    except Exception as e:
        logger.warning(f"Model pre-loading failed: {e}")    

@app.get("/")
async def root():
    return {"message": "Material Summarizer API", "status": "running"}

@app.get("/health")
async def health_check():
    """Health check endpoint specifically for Hugging Face Spaces"""
    status = "healthy" if DEPENDENCIES_LOADED else "missing_dependencies"
    return JSONResponse(
        content={
            "status": status,
            "service": "material-summarizer",
            "dependencies_loaded": DEPENDENCIES_LOADED,
            "timestamp": time.time()
        },
        status_code=200 if DEPENDENCIES_LOADED else 503
    )
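
# Quick liveness check from a shell (illustrative; assumes the default port 7860):
#   curl -i http://localhost:7860/health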


@app.get("/ping")
async def ping():
    """Simple ping endpoint for load balancers"""
    return JSONResponse(
        content={"status": "ok", "timestamp": time.time()},
        status_code=200
    )

    
@app.post("/summarize-document")
async def summarize_document(
    file: UploadFile = File(...),
    max_summary_length: Optional[int] = 1000,
    chunk_size: Optional[int] = 1500
):
    """
    Summarize uploaded document (PDF, DOCX, TXT, etc.)
    """
    if not DEPENDENCIES_LOADED:
        raise HTTPException(
            status_code=500,
            detail="Required AI dependencies not loaded. Check server logs."
        )

    temp_file_path = None

    try:
        # Validate file type
        allowed_extensions = {'.pdf', '.docx', '.doc', '.txt', '.pptx', '.ppt'}
        file_extension = os.path.splitext(file.filename)[1].lower()
        
        if file_extension not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported document format. Allowed: {', '.join(allowed_extensions)}"
            )

        # Create a temporary file via the (already imported) tempfile module, so
        # concurrent uploads cannot collide and user-supplied filenames cannot
        # point outside the working directory
        fd, temp_file_path = tempfile.mkstemp(suffix=file_extension)
        os.close(fd)

        # Save uploaded file
        logger.info(f"Saving uploaded file: {file.filename}")
        async with aiofiles.open(temp_file_path, 'wb') as out_file:
            content = await file.read()
            await out_file.write(content)

        start_time = datetime.now()

        # 1. Parse document
        logger.info("Step 1: Parsing document...")
        if not os.path.exists(temp_file_path):
            raise HTTPException(status_code=500, detail="Document file not found after upload")

        document_text = parse_document(temp_file_path, file_extension)
        logger.info(f"Extracted text length: {len(document_text)} characters")

        if not document_text or len(document_text.strip()) < 10:
            raise HTTPException(status_code=500, detail="Document parsing failed or content too short")

        # 2. Summarize text with chunking
        logger.info("Step 2: Generating summary...")
        
        def custom_summarize_func(text):
            return summarize_text(
                text, 
                model_name="facebook/bart-large-cnn",
                max_length=max_summary_length,
                min_length=min(100, max_summary_length // 3)
            )

        final_summary = chunked_summarize(
            text=document_text,
            summarize_func=custom_summarize_func,
            max_chunk_size=chunk_size
        )

        if not final_summary or len(final_summary.strip()) < 10:
            raise HTTPException(status_code=500, detail="Summary generation failed")

        processing_time = (datetime.now() - start_time).total_seconds()

        logger.info(f"Summarization completed in {processing_time:.2f} seconds")

        return {
            "success": True,
            "summary": final_summary,
            "original_length": len(document_text),
            "summary_length": len(final_summary),
            "processing_time": processing_time,
            "file_type": file_extension
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        logger.error(traceback.format_exc())
        raise HTTPException(
            status_code=500,
            detail=f"Document processing failed: {str(e)}"
        )
    finally:
        # Cleanup temporary files
        try:
            if temp_file_path and os.path.exists(temp_file_path):
                os.remove(temp_file_path)
                logger.info(f"Cleaned up: {temp_file_path}")
        except Exception as cleanup_error:
            logger.error(f"Cleanup error: {cleanup_error}")

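
# Illustrative client sketch (not part of the original API surface): posts a local
# file to /summarize-document. Assumes the server is reachable at `base_url` and
# that the third-party `requests` package is installed; the helper is never called
# by the app itself.
def _example_summarize_request(path: str = "sample.pdf",
                               base_url: str = "http://localhost:7860") -> dict:
    """Send a document to /summarize-document and return the parsed JSON reply."""
    import requests  # local import so the server itself does not depend on it

    with open(path, "rb") as fh:
        resp = requests.post(
            f"{base_url}/summarize-document",
            files={"file": (os.path.basename(path), fh)},
            params={"max_summary_length": 500, "chunk_size": 1500},
        )
    resp.raise_for_status()
    return resp.json()
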
@app.post("/batch-summarize")
async def batch_summarize_documents(files: List[UploadFile] = File(...)):
    """
    Summarize multiple documents in batch
    """
    if not DEPENDENCIES_LOADED:
        raise HTTPException(
            status_code=500,
            detail="Required AI dependencies not loaded. Check server logs."
        )

    results = []
    
    for file in files:
        try:
            # Use the single document summarization function
            result = await summarize_document(file)
            result["filename"] = file.filename
            results.append(result)
        except Exception as e:
            results.append({
                "success": False,
                "filename": file.filename,
                "error": str(e)
            })

    return {
        "success": True,
        "processed_files": len(results),
        "results": results
    }

if __name__ == "__main__":
    logger.info("Starting Material Summarizer Server...")
    logger.info("Dependencies loaded: %s", DEPENDENCIES_LOADED)

    if not DEPENDENCIES_LOADED:
        logger.error("CRITICAL: AI dependencies not loaded. Document processing will not work!")

    port = int(os.environ.get("MATERIAL_PORT", 7860))
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=port,
        reload=False
    )
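
# Equivalent CLI start (illustrative; assumes this file is named app.py and the
# command is run from its directory):
#   uvicorn app:app --host 0.0.0.0 --port 7860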