from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
import warnings
warnings.filterwarnings("ignore", message=".*_pytree_node.*")
import uvicorn
import os
import tempfile
import aiofiles
from datetime import datetime
import traceback
import logging
from typing import List, Optional
import time
from fastapi.responses import JSONResponse
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="Material Summarizer API")
from dotenv import load_dotenv
load_dotenv()
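# Example .env for local development (hypothetical values; only these two
# variables are read below):
#   FRONTEND_URL=https://my-frontend.example.com
#   BACKEND_URL=http://localhost:5000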
# Get URLs from environment
FRONTEND_URL = os.getenv('FRONTEND_URL')
BACKEND_URL = os.getenv('BACKEND_URL', 'http://localhost:5000')
# CORS middleware
app.add_middleware(
    CORSMiddleware,
    # Filter out unset URLs so a missing FRONTEND_URL doesn't add None as an origin
    allow_origins=[url for url in (FRONTEND_URL, BACKEND_URL) if url],  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
    max_age=600,
)
# Import processing functions
try:
    from document_parser import parse_document
    from summarizer import summarize_text
    from utils import chunked_summarize
    DEPENDENCIES_LOADED = True
    logger.info("All AI dependencies loaded successfully")
except ImportError as e:
    logger.error(f"Import error: {e}")
    DEPENDENCIES_LOADED = False
@app.on_event("startup")
async def startup_event():
    """Handle startup events"""
    logger.info("Application startup initiated")
    # Load the model on startup to avoid cold-start delays
    try:
        from summarizer import get_summarizer
        get_summarizer()  # Pre-load the model
        logger.info("Models pre-loaded successfully")
    except Exception as e:
        logger.warning(f"Model pre-loading failed: {e}")
@app.get("/")
async def root():
    return {"message": "Material Summarizer API", "status": "running"}
@app.get("/health")
async def health_check():
    """Health check endpoint specifically for Hugging Face Spaces"""
    status = "healthy" if DEPENDENCIES_LOADED else "missing_dependencies"
    return JSONResponse(
        content={
            "status": status,
            "service": "material-summarizer",
            "dependencies_loaded": DEPENDENCIES_LOADED,
            "timestamp": time.time()
        },
        status_code=200 if DEPENDENCIES_LOADED else 503
    )
@app.get("/ping")
async def ping():
    """Simple ping endpoint for load balancers"""
    return JSONResponse(
        content={"status": "ok", "timestamp": time.time()},
        status_code=200
    )
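# Quick smoke test (assuming the default port configured below):
#   import requests
#   requests.get("http://localhost:7860/ping").json()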
@app.post("/summarize-document")
async def summarize_document(
    file: UploadFile = File(...),
    max_summary_length: Optional[int] = 1000,
    chunk_size: Optional[int] = 1500
):
    """
    Summarize an uploaded document (PDF, DOCX, TXT, etc.)
    """
    if not DEPENDENCIES_LOADED:
        raise HTTPException(
            status_code=500,
            detail="Required AI dependencies not loaded. Check server logs."
        )
    temp_file_path = None
    try:
        # Validate file type
        allowed_extensions = {'.pdf', '.docx', '.doc', '.txt', '.pptx', '.ppt'}
        file_extension = os.path.splitext(file.filename)[1].lower()
        if file_extension not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported document format. Allowed: {', '.join(allowed_extensions)}"
            )
        # Create a temporary file path in the system temp directory;
        # basename() guards against path traversal in the client-supplied filename
        temp_file_path = os.path.join(
            tempfile.gettempdir(), f"temp_{os.path.basename(file.filename)}"
        )
        # Save uploaded file
        logger.info(f"Saving uploaded file: {file.filename}")
        async with aiofiles.open(temp_file_path, 'wb') as out_file:
            content = await file.read()
            await out_file.write(content)
        start_time = datetime.now()
        # 1. Parse document
        logger.info("Step 1: Parsing document...")
        if not os.path.exists(temp_file_path):
            raise HTTPException(status_code=500, detail="Document file not found after upload")
        document_text = parse_document(temp_file_path, file_extension)
        logger.info(f"Extracted text length: {len(document_text)} characters")
        if not document_text or len(document_text.strip()) < 10:
            raise HTTPException(status_code=500, detail="Document parsing failed or content too short")
        # 2. Summarize text with chunking
        logger.info("Step 2: Generating summary...")
        def custom_summarize_func(text):
            return summarize_text(
                text,
                model_name="facebook/bart-large-cnn",
                max_length=max_summary_length,
                min_length=min(100, max_summary_length // 3)
            )
        # chunked_summarize (from utils) is expected to split long text into
        # chunks of at most `chunk_size` characters, summarize each chunk with
        # custom_summarize_func, and combine the partial summaries
        final_summary = chunked_summarize(
            text=document_text,
            summarize_func=custom_summarize_func,
            max_chunk_size=chunk_size
        )
        if not final_summary or len(final_summary.strip()) < 10:
            raise HTTPException(status_code=500, detail="Summary generation failed")
        processing_time = (datetime.now() - start_time).total_seconds()
        logger.info(f"Summarization completed in {processing_time:.2f} seconds")
        return {
            "success": True,
            "summary": final_summary,
            "original_length": len(document_text),
            "summary_length": len(final_summary),
            "processing_time": processing_time,
            "file_type": file_extension
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        logger.error(traceback.format_exc())
        raise HTTPException(
            status_code=500,
            detail=f"Document processing failed: {str(e)}"
        )
    finally:
        # Clean up temporary files
        try:
            if temp_file_path and os.path.exists(temp_file_path):
                os.remove(temp_file_path)
                logger.info(f"Cleaned up: {temp_file_path}")
        except Exception as cleanup_error:
            logger.error(f"Cleanup error: {cleanup_error}")
@app.post("/batch-summarize")
async def batch_summarize_documents(files: List[UploadFile] = File(...)):
    """
    Summarize multiple documents in batch
    """
    if not DEPENDENCIES_LOADED:
        raise HTTPException(
            status_code=500,
            detail="Required AI dependencies not loaded. Check server logs."
        )
    results = []
    for file in files:
        try:
            # Reuse the single-document summarization handler with its defaults
            result = await summarize_document(file)
            result["filename"] = file.filename
            results.append(result)
        except Exception as e:
            results.append({
                "success": False,
                "filename": file.filename,
                "error": str(e)
            })
    return {
        "success": True,
        "processed_files": len(results),
        "results": results
    }
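# Example batch call (hypothetical file names; repeat the "files" field once
# per document):
#   import requests
#   with open("chapter1.pdf", "rb") as f1, open("chapter2.docx", "rb") as f2:
#       r = requests.post(
#           "http://localhost:7860/batch-summarize",
#           files=[("files", f1), ("files", f2)],
#       )
#   print(r.json()["processed_files"])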
if __name__ == "__main__":
logger.info("Starting Material Summarizer Server...")
logger.info("Dependencies loaded: %s", DEPENDENCIES_LOADED)
if not DEPENDENCIES_LOADED:
logger.error("CRITICAL: AI dependencies not loaded. Document processing will not work!")
port = int(os.environ.get("MATERIAL_PORT", 7860))
uvicorn.run(
"app:app",
host="0.0.0.0",
port=port,
reload=False
) |