Spaces:

ND06-25
/

Slash

Sleeping

App Files Files Community

Slash / api /summarizer.py

ND06-25

first commit to AI repo

6880cd9 6 months ago

raw

history blame

8.27 kB

	from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
	from typing import List, Dict, Any, Optional, Union
	import torch
	import logging
	from .utils import chunk_text

	logger = logging.getLogger(__name__)

	class BookSummarizer:
	    """
	    Handles AI-powered text summarization using transformer models.

	    The model is loaded lazily: constructing the object only records the
	    model name and target device. The tokenizer, model and pipeline are
	    created by ``load_model()``, which ``summarize_text()`` calls the first
	    time it actually needs to run the model.
	    """

	    # Texts with fewer words than this are returned unchanged by
	    # summarize_text() instead of being sent to the model.
	    _MIN_WORDS_TO_SUMMARIZE = 50

	    # summarize_book() runs a second summarization pass when the joined
	    # chunk summaries contain more words than this.
	    _FINAL_PASS_WORD_THRESHOLD = 500

	    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
	        """
	        Initialize the summarizer with a specific model.

	        No model weights are loaded here; see ``load_model()``.

	        Args:
	            model_name: Hugging Face model name for summarization
	        """
	        self.model_name = model_name
	        self.summarizer = None
	        self.tokenizer = None
	        self.model = None
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"

	        logger.info("Initializing summarizer with model: %s", model_name)
	        logger.info("Using device: %s", self.device)

	    def load_model(self):
	        """
	        Load the summarization model and tokenizer and build the pipeline.

	        Populates ``self.tokenizer``, ``self.model`` and ``self.summarizer``
	        for the current ``self.model_name``.

	        Raises:
	            Exception: Any error raised while loading is logged and re-raised.
	        """
	        try:
	            logger.info("Loading summarization model...")

	            # Load tokenizer and model
	            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
	            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

	            # Move model to appropriate device
	            self.model.to(self.device)

	            # Create pipeline (device index 0 for CUDA, -1 for CPU)
	            self.summarizer = pipeline(
	                "summarization",
	                model=self.model,
	                tokenizer=self.tokenizer,
	                device=0 if self.device == "cuda" else -1,
	            )

	            logger.info("Model loaded successfully")

	        except Exception as e:
	            # The caller decides how to report the failure; only log the
	            # message here so the traceback is not emitted twice.
	            logger.error("Error loading model: %s", e)
	            raise

	    def summarize_text(self, text: str, max_length: int = 150, min_length: int = 50,
	                       do_sample: bool = False) -> Dict[str, Any]:
	        """
	        Summarize a single text chunk.

	        Texts shorter than 50 whitespace-separated words are returned
	        unchanged, without loading or invoking the model.

	        Args:
	            text: Text to summarize
	            max_length: Maximum length of summary (passed to the pipeline)
	            min_length: Minimum length of summary (passed to the pipeline)
	            do_sample: Whether to use sampling for generation

	        Returns:
	            On success, a dictionary with ``success`` (True), ``summary``,
	            ``original_length`` and ``summary_length`` (word counts) and
	            ``compression_ratio`` (summary words / original words).
	            On failure, a dictionary with ``success`` (False), an empty
	            ``summary`` and ``error`` (the exception message). Exceptions
	            are not propagated.
	        """
	        try:
	            original_words = len(text.split())

	            # Check the short-text case first so that inputs which are
	            # returned verbatim never trigger the (expensive) model load.
	            if original_words < self._MIN_WORDS_TO_SUMMARIZE:
	                return {
	                    'success': True,
	                    'summary': text,
	                    'original_length': original_words,
	                    'summary_length': original_words,
	                    'compression_ratio': 1.0,
	                }

	            if not self.summarizer:
	                self.load_model()

	            # Generate summary
	            summary_result = self.summarizer(
	                text,
	                max_length=max_length,
	                min_length=min_length,
	                do_sample=do_sample,
	                truncation=True,
	            )

	            summary = summary_result[0]['summary_text']

	            # original_words is at least the short-text threshold here,
	            # so the division cannot hit zero.
	            summary_words = len(summary.split())
	            compression_ratio = summary_words / original_words

	            return {
	                'success': True,
	                'summary': summary,
	                'original_length': original_words,
	                'summary_length': summary_words,
	                'compression_ratio': compression_ratio,
	            }

	        except Exception as e:
	            # Best-effort API: report the failure in the result instead of
	            # raising, but keep the traceback in the log.
	            logger.exception("Error summarizing text: %s", e)
	            return {
	                'success': False,
	                'summary': '',
	                'error': str(e),
	            }

	    def summarize_book(self, text: str, chunk_size: int = 1000, overlap: int = 100,
	                       max_length: int = 150, min_length: int = 50) -> Dict[str, Any]:
	        """
	        Summarize a complete book by processing it in chunks.

	        The text is split with ``chunk_text``, each chunk is summarized with
	        ``summarize_text``, and the chunk summaries are joined with spaces.
	        If the joined text exceeds 500 words it is summarized once more
	        (max_length=300, min_length=100); if that pass fails the joined text
	        is kept.

	        Args:
	            text: Complete book text
	            chunk_size: Size of each text chunk (forwarded to ``chunk_text``)
	            overlap: Overlap between chunks (forwarded to ``chunk_text``)
	            max_length: Maximum length of each chunk summary
	            min_length: Minimum length of each chunk summary

	        Returns:
	            On success, a dictionary with ``success`` (True), ``summary``,
	            ``chunk_summaries`` and ``statistics``. The word totals in
	            ``statistics`` cover only chunks that were summarized
	            successfully and are computed before the optional final pass;
	            ``failed_chunks`` counts chunks replaced by a truncated excerpt.
	            On failure, a dictionary with ``success`` (False), an empty
	            ``summary`` and ``error``.
	        """
	        try:
	            logger.info("Starting book summarization...")

	            # Split text into chunks
	            chunks = chunk_text(text, chunk_size, overlap)
	            total_chunks = len(chunks)
	            logger.info("Split text into %d chunks", total_chunks)

	            # Summarize each chunk
	            chunk_summaries = []
	            total_original_words = 0
	            total_summary_words = 0
	            failed_chunks = 0

	            for number, chunk in enumerate(chunks, start=1):
	                logger.info("Processing chunk %d/%d", number, total_chunks)

	                result = self.summarize_text(chunk, max_length, min_length)

	                if result['success']:
	                    chunk_summaries.append(result['summary'])
	                    total_original_words += result['original_length']
	                    total_summary_words += result['summary_length']
	                else:
	                    failed_chunks += 1
	                    logger.warning(
	                        "Failed to summarize chunk %d: %s",
	                        number,
	                        result.get('error', 'Unknown error'),
	                    )
	                    # Keep a short excerpt of the original chunk so the
	                    # combined summary has no silent gap.
	                    chunk_summaries.append(chunk[:200] + "...")

	            # Combine all summaries
	            combined_summary = " ".join(chunk_summaries)

	            # Create final summary if the combined summary is still too long
	            if len(combined_summary.split()) > self._FINAL_PASS_WORD_THRESHOLD:
	                logger.info("Creating final summary from combined summaries...")
	                final_result = self.summarize_text(
	                    combined_summary, max_length=300, min_length=100
	                )
	                if final_result['success']:
	                    combined_summary = final_result['summary']

	            # Calculate overall statistics
	            if total_original_words > 0:
	                overall_compression = total_summary_words / total_original_words
	            else:
	                overall_compression = 0

	            return {
	                'success': True,
	                'summary': combined_summary,
	                'statistics': {
	                    'total_chunks': total_chunks,
	                    'total_original_words': total_original_words,
	                    'total_summary_words': total_summary_words,
	                    'overall_compression_ratio': overall_compression,
	                    'final_summary_length': len(combined_summary.split()),
	                    'failed_chunks': failed_chunks,
	                },
	                'chunk_summaries': chunk_summaries,
	            }

	        except Exception as e:
	            logger.exception("Error in book summarization: %s", e)
	            return {
	                'success': False,
	                'summary': '',
	                'error': str(e),
	            }

	    def get_available_models(self) -> List[Dict[str, Union[str, int]]]:
	        """
	        Get list of available summarization models.

	        Returns:
	            A new list of dictionaries, each with ``name``, ``description``
	            and ``max_length`` keys. The list is static and does not depend
	            on the currently selected model.
	        """
	        return [
	            {
	                'name': 'facebook/bart-large-cnn',
	                'description': 'BART model fine-tuned on CNN news articles (recommended)',
	                'max_length': 1024,
	            },
	            {
	                'name': 't5-small',
	                'description': 'Small T5 model, faster but less accurate',
	                'max_length': 512,
	            },
	            {
	                'name': 'facebook/bart-base',
	                'description': 'Base BART model, balanced performance',
	                'max_length': 1024,
	            },
	        ]

	    def change_model(self, model_name: str):
	        """
	        Change the summarization model.

	        Drops the references to the current pipeline, tokenizer and model so
	        that the next ``summarize_text()`` call that needs the model loads
	        ``model_name`` instead. Nothing is loaded by this call itself.

	        Args:
	            model_name: New model name to use
	        """
	        self.model_name = model_name
	        self.summarizer = None
	        self.tokenizer = None
	        self.model = None
	        logger.info("Model changed to: %s", model_name)