Spaces:

venni16
/

material-summeraizer

Running

App Files Files Community

venni16 committed 20 days ago

Commit

5335722

verified ·

1 Parent(s): 83ae875

Upload 7 files

Browse files

Files changed (7) hide show

.dockerignore +26 -0
Dockerfile +30 -0
app.py +204 -0
document_parser.py +142 -0
requirements.txt +15 -0
summarizer.py +122 -0
utils.py +93 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,26 @@

+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+venv
+.venv
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
+.DS_Store
+*.swp
+*.swo
+*~

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

FROM python:3.9-slim
WORKDIR /code
# Install system dependencies for document processing
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libmagic1 \
    libmagic-dev \
    poppler-utils \
    antiword \
    unrtf \
    tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first so the dependency layer stays cached when only
# application code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Create temp directory for file processing
RUN mkdir -p /tmp/materials
# Expose port
EXPOSE 7861
# Start the application. Exec form (JSON array) runs uvicorn directly instead
# of wrapping it in `/bin/sh -c`, so the server receives stop signals itself.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7861"]

app.py ADDED Viewed

	@@ -0,0 +1,204 @@

+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+import uvicorn
+import os
+import tempfile
+import aiofiles
+from datetime import datetime
+import traceback
+import logging
+from typing import List, Optional
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
app = FastAPI(title="Material Summarizer API")
from dotenv import load_dotenv
load_dotenv()
# Get URLs from environment
FRONTEND_URL = os.getenv('FRONTEND_URL')
BACKEND_URL = os.getenv('BACKEND_URL', 'http://localhost:5000')
# CORS middleware.
# The origins must be the configured URL values themselves; FRONTEND_URL has
# no default, so unset/empty entries are dropped instead of passing None.
_allowed_origins = [origin for origin in (FRONTEND_URL, BACKEND_URL) if origin]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_allowed_origins,  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Import processing functions. A failed import is recorded in
# DEPENDENCIES_LOADED rather than aborting startup, so the health endpoint can
# still report the problem.
try:
    from document_parser import parse_document
    from summarizer import summarize_text
    from utils import chunked_summarize
    DEPENDENCIES_LOADED = True
    logger.info("All AI dependencies loaded successfully")
except ImportError as e:
    logger.error(f"Import error: {e}")
    DEPENDENCIES_LOADED = False
@app.get("/")
async def root():
    """Landing endpoint: identify the service and report that it is running."""
    return dict(message="Material Summarizer API", status="running")
@app.get("/health")
async def health_check():
    """Report service health, including whether the processing modules imported."""
    if DEPENDENCIES_LOADED:
        state = "healthy"
    else:
        state = "missing_dependencies"
    report = {"status": state, "service": "material-summarizer"}
    report["dependencies_loaded"] = DEPENDENCIES_LOADED
    return report
@app.post("/summarize-document")
async def summarize_document(
    file: UploadFile = File(...),
    max_summary_length: Optional[int] = 1000,
    chunk_size: Optional[int] = 1500
):
    """
    Summarize an uploaded document (PDF, DOCX, DOC, TXT, PPTX, PPT).

    The upload is written to a private temporary file, parsed to plain text,
    summarized chunk by chunk, and the temporary file is removed afterwards.

    Args:
        file: The uploaded document; its extension selects the parser.
        max_summary_length: Upper bound handed to the summarizer as
            ``max_length``; ``None`` falls back to 1000.
        chunk_size: Maximum chunk size in characters handed to
            ``chunked_summarize``; ``None`` falls back to 1500.

    Returns:
        A dict with ``success``, ``summary``, ``original_length``,
        ``summary_length``, ``processing_time`` (seconds) and ``file_type``.

    Raises:
        HTTPException: 400 for an unsupported extension or non-positive size
            parameters; 500 when dependencies are missing or when parsing or
            summarization fails.
    """
    if not DEPENDENCIES_LOADED:
        raise HTTPException(
            status_code=500,
            detail="Required AI dependencies not loaded. Check server logs."
        )
    # The batch endpoint calls this function directly, so tolerate explicit None.
    if max_summary_length is None:
        max_summary_length = 1000
    if chunk_size is None:
        chunk_size = 1500
    if max_summary_length <= 0 or chunk_size <= 0:
        raise HTTPException(
            status_code=400,
            detail="max_summary_length and chunk_size must be positive integers"
        )
    temp_file_path = None
    try:
        # Validate file type (filename can be missing on some uploads)
        allowed_extensions = {'.pdf', '.docx', '.doc', '.txt', '.pptx', '.ppt'}
        file_extension = os.path.splitext(file.filename or "")[1].lower()
        if file_extension not in allowed_extensions:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported document format. Allowed: {', '.join(sorted(allowed_extensions))}"
            )
        # Create a uniquely named temporary file. The client-supplied filename
        # is never used as a path: that avoids path traversal, collisions
        # between concurrent uploads of the same name, and writes into the
        # working directory.
        fd, temp_file_path = tempfile.mkstemp(prefix="material_", suffix=file_extension)
        os.close(fd)
        # Save uploaded file
        logger.info(f"Saving uploaded file: {file.filename}")
        async with aiofiles.open(temp_file_path, 'wb') as out_file:
            content = await file.read()
            await out_file.write(content)
        start_time = datetime.now()
        # 1. Parse document
        logger.info("Step 1: Parsing document...")
        document_text = parse_document(temp_file_path, file_extension)
        if not document_text or len(document_text.strip()) < 10:
            raise HTTPException(status_code=500, detail="Document parsing failed or content too short")
        logger.info(f"Extracted text length: {len(document_text)} characters")
        # 2. Summarize text with chunking
        # NOTE(review): parsing and summarization are synchronous calls inside
        # an async handler, so they hold the event loop while they run.
        logger.info("Step 2: Generating summary...")

        def custom_summarize_func(text):
            return summarize_text(
                text,
                model_name="facebook/bart-large-cnn",
                max_length=max_summary_length,
                min_length=min(100, max_summary_length // 3)
            )

        final_summary = chunked_summarize(
            text=document_text,
            summarize_func=custom_summarize_func,
            max_chunk_size=chunk_size
        )
        if not final_summary or len(final_summary.strip()) < 10:
            raise HTTPException(status_code=500, detail="Summary generation failed")
        processing_time = (datetime.now() - start_time).total_seconds()
        logger.info(f"Summarization completed in {processing_time:.2f} seconds")
        return {
            "success": True,
            "summary": final_summary,
            "original_length": len(document_text),
            "summary_length": len(final_summary),
            "processing_time": processing_time,
            "file_type": file_extension
        }
    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        logger.error(traceback.format_exc())
        raise HTTPException(
            status_code=500,
            detail=f"Document processing failed: {str(e)}"
        )
    finally:
        # Cleanup temporary files; a cleanup failure must not mask the result.
        try:
            if temp_file_path and os.path.exists(temp_file_path):
                os.remove(temp_file_path)
                logger.info(f"Cleaned up: {temp_file_path}")
        except Exception as cleanup_error:
            logger.error(f"Cleanup error: {cleanup_error}")
@app.post("/batch-summarize")
async def batch_summarize_documents(files: List[UploadFile] = File(...)):
    """
    Summarize multiple documents in one request.

    Each file goes through ``summarize_document`` with its default size
    parameters. A failure on one file is recorded in that file's result entry
    and does not abort the rest of the batch.

    Returns:
        A dict with ``success`` (always True for a completed batch),
        ``processed_files`` (number of result entries) and ``results`` (one
        dict per file, each carrying ``filename`` and its own ``success``).

    Raises:
        HTTPException: 500 when the processing dependencies are not loaded.
    """
    if not DEPENDENCIES_LOADED:
        raise HTTPException(
            status_code=500,
            detail="Required AI dependencies not loaded. Check server logs."
        )
    results = []
    for file in files:
        try:
            # Use the single document summarization function
            result = await summarize_document(file)
            result["filename"] = file.filename
            results.append(result)
        except HTTPException as http_error:
            # Report the human-readable detail; str() of an HTTPException is
            # not guaranteed to contain it.
            results.append({
                "success": False,
                "filename": file.filename,
                "error": str(http_error.detail)
            })
        except Exception as e:
            results.append({
                "success": False,
                "filename": file.filename,
                "error": str(e)
            })
    return {
        "success": True,
        "processed_files": len(results),
        "results": results
    }
if __name__ == "__main__":
    # Direct execution: log the dependency state, then serve with uvicorn on
    # the port from MATERIAL_PORT (7861 when unset).
    logger.info("Starting Material Summarizer Server...")
    logger.info("Dependencies loaded: %s", DEPENDENCIES_LOADED)
    if not DEPENDENCIES_LOADED:
        logger.error("CRITICAL: AI dependencies not loaded. Document processing will not work!")
    listen_port = int(os.environ.get("MATERIAL_PORT", 7861))
    server_options = {"host": "0.0.0.0", "port": listen_port, "reload": False}
    uvicorn.run("app:app", **server_options)

document_parser.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import os
+import logging
+from typing import Optional
+import pdfplumber
+from docx import Document
+import PyPDF2
+from pptx import Presentation
+logger = logging.getLogger(__name__)
def parse_document(file_path: str, file_extension: str) -> str:
    """
    Extract text from a document by dispatching on its file extension.

    ``.pdf`` goes to ``parse_pdf``, ``.docx``/``.doc`` to ``parse_docx``,
    ``.pptx``/``.ppt`` to ``parse_pptx`` and ``.txt`` to ``parse_txt``.
    Any other extension raises ``ValueError``. Every failure is logged with
    the file path and re-raised unchanged.

    NOTE(review): legacy ``.doc``/``.ppt`` files are handed to the same
    parsers as their OOXML counterparts - confirm those libraries can
    actually read the legacy binary formats.
    """
    try:
        handlers = {
            '.pdf': parse_pdf,
            '.docx': parse_docx,
            '.doc': parse_docx,
            '.pptx': parse_pptx,
            '.ppt': parse_pptx,
            '.txt': parse_txt,
        }
        handler = handlers.get(file_extension)
        if handler is None:
            raise ValueError(f"Unsupported file format: {file_extension}")
        return handler(file_path)
    except Exception as e:
        logger.error(f"Error parsing document {file_path}: {e}")
        raise
def parse_pdf(file_path: str) -> str:
    """
    Extract text from a PDF, trying pdfplumber first and PyPDF2 as fallback.

    PyPDF2 is only consulted when pdfplumber produced no non-whitespace text
    (including when it failed outright). Extractor errors are logged as
    warnings rather than raised; text gathered before an error is kept.

    Raises:
        ValueError: If neither extractor yields any text.
    """
    # Page texts are collected in lists and joined once, instead of growing a
    # string with += on every page.
    pages = []
    # Method 1: Use pdfplumber (better for text-based PDFs)
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    pages.append(page_text)
    except Exception as e:
        logger.warning(f"pdfplumber failed: {e}")
    text = "\n".join(pages)
    # Method 2: Use PyPDF2 as fallback
    if not text.strip():
        fallback_pages = []
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        fallback_pages.append(page_text)
        except Exception as e:
            logger.warning(f"PyPDF2 failed: {e}")
        text = "\n".join(fallback_pages)
    if not text.strip():
        raise ValueError("Could not extract text from PDF")
    return clean_text(text)
def parse_docx(file_path: str) -> str:
    """
    Extract text from a Word document: body paragraphs first, then table cells.

    Paragraphs and cells whose text is only whitespace are skipped. Each kept
    item becomes its own line before the result is passed to ``clean_text``.
    Errors are logged and re-raised.
    """
    try:
        doc = Document(file_path)
        parts = []
        # Extract paragraphs (read .text once per paragraph)
        for paragraph in doc.paragraphs:
            paragraph_text = paragraph.text
            if paragraph_text.strip():
                parts.append(paragraph_text)
        # Extract tables
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_text = cell.text
                    if cell_text.strip():
                        parts.append(cell_text)
        # Join once instead of concatenating onto a string in the loops.
        return clean_text("\n".join(parts))
    except Exception as e:
        logger.error(f"Error parsing DOCX file: {e}")
        raise
def parse_pptx(file_path: str) -> str:
    """
    Extract text from a PowerPoint file.

    Walks every shape on every slide and keeps the ``text`` of shapes that
    expose one and whose text is not only whitespace. Each kept text becomes
    its own line before ``clean_text`` is applied. Errors are logged and
    re-raised.
    """
    try:
        prs = Presentation(file_path)
        parts = []
        for slide in prs.slides:
            for shape in slide.shapes:
                # Not every shape has a text attribute.
                if not hasattr(shape, "text"):
                    continue
                shape_text = shape.text
                if shape_text.strip():
                    parts.append(shape_text)
        # Join once instead of concatenating onto a string in the loop.
        return clean_text("\n".join(parts))
    except Exception as e:
        logger.error(f"Error parsing PPTX file: {e}")
        raise
def parse_txt(file_path: str) -> str:
    """
    Extract text from a plain text file, trying several encodings in order.

    Order matters: ``utf-8-sig`` reads UTF-8 with or without a byte-order mark
    (dropping the mark), ``cp1252`` rejects a few undefined bytes, and
    ``latin-1`` accepts every byte, so it must come last or the encodings
    after it could never be tried.

    Raises:
        ValueError: If no encoding can decode the file. With ``latin-1`` in
            the list this is a safety net, not an expected outcome.
    """
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                text = file.read()
        except UnicodeDecodeError:
            continue
        return clean_text(text)
    raise ValueError("Could not decode text file with any encoding")
def clean_text(text: str) -> str:
    """
    Clean and normalize extracted text.

    Every line is stripped of surrounding whitespace, and lines of two
    characters or fewer (blank lines and likely formatting artifacts) are
    dropped. The remaining lines are joined with single newlines.

    Because no empty line survives the filter, the result can never contain
    consecutive newlines or leading/trailing whitespace, so no further
    collapsing or stripping pass is needed.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    meaningful_lines = [line for line in stripped_lines if len(line) > 2]
    return '\n'.join(meaningful_lines)

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

# Index options are file-level: they must be on their own line, not appended
# to a requirement. The extra index provides the CPU-only PyTorch wheels while
# everything else still resolves from the default index.
--extra-index-url https://download.pytorch.org/whl/cpu
fastapi==0.104.1
uvicorn==0.24.0
python-multipart==0.0.6
aiofiles==23.2.1
python-dotenv==1.0.0
transformers==4.35.2
torch==2.6.0
accelerate==0.24.1
sentence-transformers==2.2.2
numpy==1.24.3
pypdf2==3.0.1
python-magic==0.4.27
pdfplumber==0.10.3
python-docx==1.1.0
python-pptx==0.6.21

summarizer.py ADDED Viewed

	@@ -0,0 +1,122 @@

+from transformers import pipeline, AutoTokenizer
+import logging
+from typing import Optional
+logger = logging.getLogger(__name__)
# Global summarizer instance for better performance, together with the model
# name it was loaded for so a request for a different model is not served
# from the wrong cache entry.
_summarizer = None
_tokenizer = None
_loaded_model_name = None
def get_summarizer(model_name: str = "facebook/bart-large-cnn"):
    """
    Return a cached ``(summarizer, tokenizer)`` pair for ``model_name``.

    The pair is loaded on first use and reloaded when a different model name
    is requested. Both objects are created before the cache is updated, so a
    failed load never leaves a pipeline cached without its tokenizer.

    Raises:
        Exception: Whatever the model or tokenizer load raised, after logging.
    """
    global _summarizer, _tokenizer, _loaded_model_name
    cache_is_valid = (
        _summarizer is not None
        and _tokenizer is not None
        and _loaded_model_name == model_name
    )
    if not cache_is_valid:
        try:
            summarizer = pipeline(
                "summarization",
                model=model_name,
                tokenizer=model_name
            )
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as e:
            logger.error(f"Failed to load summarizer: {e}")
            raise
        _summarizer, _tokenizer, _loaded_model_name = summarizer, tokenizer, model_name
        logger.info(f"Summarizer model {model_name} loaded successfully")
    return _summarizer, _tokenizer
def summarize_text(
    text: str,
    model_name: str = "facebook/bart-large-cnn",
    max_length: int = 500,
    min_length: int = 200,
    compression_ratio: Optional[float] = None
) -> str:
    """
    Summarize text with a transformer model, falling back to key sentences.

    Args:
        text: Text to summarize. Inputs under 30 words are returned unchanged
            without loading the model.
        model_name: Model identifier passed to ``get_summarizer``.
        max_length: Upper bound for the generated summary length; it is
            reduced according to the input's word count.
        min_length: Lower bound for the generated summary length; always kept
            below ``max_length``.
        compression_ratio: When given, caps ``max_length`` at
            ``word_count * compression_ratio`` instead of using the adaptive
            word-count tiers.

    Returns:
        The generated summary, or the output of ``extract_key_sentences`` if
        model loading or generation fails or the summary is under three words.
    """
    word_count = len(text.split())
    # If text is too short, return as is (checked first so a trivial input
    # never triggers a model load).
    if word_count < 30:
        return text
    try:
        summarizer, tokenizer = get_summarizer(model_name)
        # Calculate appropriate lengths
        if compression_ratio:
            max_length = min(max_length, int(word_count * compression_ratio))
            min_length = min(min_length, max_length // 2)
        else:
            # Adaptive length calculation
            if word_count < 100:
                max_length = min(100, word_count - 10)
                min_length = max(30, max_length // 2)
            elif word_count < 500:
                max_length = min(150, word_count // 3)
                min_length = max(50, max_length // 2)
            else:
                max_length = min(max_length, word_count // 4)
                min_length = min(min_length, max_length // 3)
        # Keep the bounds usable: max_length at least 1, and
        # 0 <= min_length < max_length.
        max_length = max(1, max_length)
        min_length = max(0, min(min_length, max_length - 1))
        # Tokenize to check length
        tokens = tokenizer.encode(text)
        if len(tokens) > tokenizer.model_max_length:
            # Truncate if too long, leaving 100 tokens of headroom
            tokens = tokens[:tokenizer.model_max_length - 100]
            text = tokenizer.decode(tokens, skip_special_tokens=True)
        logger.info(f"Summarizing text: {word_count} words -> {max_length} max tokens")
        summary = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
            clean_up_tokenization_spaces=True
        )
        result = summary[0]['summary_text'].strip()
        if not result or len(result.split()) < 3:
            raise ValueError("Generated summary is too short or empty")
        return result
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        # Enhanced fallback: extract key sentences. Always ask for at least
        # one sentence; a small max_length would otherwise request zero.
        return extract_key_sentences(text, max(1, min(3, max_length // 50)))
def extract_key_sentences(text: str, num_sentences: int = 3) -> str:
    """
    Fallback summary: pick up to ``num_sentences`` representative sentences.

    The text is split on ``.`` and fragments of ten characters or fewer are
    ignored. If no fragment qualifies, the text itself is returned (cut to 500
    characters plus ``...`` when longer). If there are no more fragments than
    requested, all of them are returned. Otherwise candidates are taken in the
    order first, middle, last and cut to ``num_sentences``, so at most three
    sentences are ever returned.

    Args:
        text: Source text.
        num_sentences: Number of sentences wanted; values below 1 are treated
            as 1 so the result is never an empty "." string.
    """
    num_sentences = max(1, num_sentences)
    fragments = (fragment.strip() for fragment in text.split('.'))
    meaningful_sentences = [fragment for fragment in fragments if len(fragment) > 10]
    if not meaningful_sentences:
        return text[:500] + "..." if len(text) > 500 else text
    total = len(meaningful_sentences)
    if total <= num_sentences:
        return '. '.join(meaningful_sentences) + '.'
    # Simple heuristic: take first, middle, and last sentences
    key_indices = [0]
    if total > 2:
        key_indices.append(total // 2)
    key_indices.append(total - 1)
    key_sentences = [meaningful_sentences[i] for i in key_indices[:num_sentences]]
    return '. '.join(key_sentences) + '.'

utils.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import logging
+from typing import List, Callable
+logger = logging.getLogger(__name__)
def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
    """
    Split text into overlapping chunks for processing long documents.

    Args:
        text: Text to split. Text no longer than ``chunk_size`` is returned
            as a single chunk.
        chunk_size: Maximum chunk length in characters; must be positive.
        overlap: Number of characters each chunk shares with the end of the
            previous one; must not be negative. If stepping back by
            ``overlap`` would not advance past the current start, the next
            chunk begins where the current one ended.

    Returns:
        The chunks in document order, each stripped of surrounding whitespace;
        whitespace-only chunks are omitted.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")
    text_length = len(text)
    # If text is shorter than chunk_size, return as single chunk
    if text_length <= chunk_size:
        return [text]
    chunks = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        # Try to break at sentence boundary
        if end < text_length:
            # Last sentence terminator inside the chunk, if any
            sentence_end = max(
                text.rfind(marker, start, end) for marker in ('. ', '? ', '! ')
            )
            # Only cut there if it lies past 70% of the chunk, so chunks do
            # not become needlessly short.
            if sentence_end > start + chunk_size * 0.7:
                end = sentence_end + 1
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Stop once the end of the text is covered; stepping back by the
        # overlap here would only re-emit a tail already in the last chunk.
        if end >= text_length:
            break
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks
def chunked_summarize(
    text: str,
    summarize_func: Callable,
    max_chunk_size: int = 1500,
    overlap: int = 200
) -> str:
    """
    Summarize long text by processing in chunks and combining results.

    Text that fits in one chunk goes straight to ``summarize_func``. Longer
    text is split with ``chunk_text``, each chunk is summarized, and the
    partial summaries are joined with spaces. If the joined text still exceeds
    ``max_chunk_size`` it is reduced again chunk by chunk, so every part of
    the document contributes to the result instead of relying on a single
    call over the whole concatenation.

    Args:
        text: Text to summarize.
        summarize_func: Callable taking a string and returning its summary.
        max_chunk_size: Maximum chunk length in characters.
        overlap: Overlap in characters between consecutive chunks.

    Returns:
        The summary, or a fixed "unable" message when no chunk produced a
        usable summary.
    """
    if len(text) <= max_chunk_size:
        return summarize_func(text)
    text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=overlap)
    logger.info(f"Processing {len(text_chunks)} chunks...")
    partial_summaries = []
    for i, chunk in enumerate(text_chunks):
        logger.info(f"Summarizing chunk {i+1}/{len(text_chunks)}...")
        try:
            summary = summarize_func(chunk)
            # Summaries of 10 characters or fewer are treated as unusable.
            if summary and len(summary.strip()) > 10:
                partial_summaries.append(summary)
        except Exception as e:
            logger.warning(f"Failed to summarize chunk {i+1}: {e}")
            # Include the start of the original chunk as fallback
            partial_summaries.append(chunk[:200] + "...")
    if not partial_summaries:
        return "Unable to generate summary from the document."
    combined_summary_input = " ".join(partial_summaries)
    if len(combined_summary_input) <= max_chunk_size:
        return combined_summary_input
    # Final summarization if combined text is still long
    logger.info("Final summarization of combined chunks...")
    try:
        if len(combined_summary_input) < len(text):
            # The input shrank, so another chunked pass is guaranteed to
            # terminate and keeps the tail of the document represented.
            return chunked_summarize(
                combined_summary_input,
                summarize_func,
                max_chunk_size=max_chunk_size,
                overlap=overlap
            )
        # No compression was achieved; fall back to one direct pass.
        return summarize_func(combined_summary_input)
    except Exception as e:
        logger.error(f"Final summarization failed: {e}")
        # Return the combined partial summaries
        return combined_summary_input
def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
    """
    Estimate reading time in whole minutes.

    Args:
        text: Text whose whitespace-separated words are counted.
        words_per_minute: Assumed reading speed; must be positive.

    Returns:
        The rounded number of minutes, never less than 1.

    Raises:
        ValueError: If ``words_per_minute`` is zero or negative.
    """
    if words_per_minute <= 0:
        raise ValueError("words_per_minute must be a positive number")
    word_count = len(text.split())
    return max(1, round(word_count / words_per_minute))