Spaces:

Kansallisarkisto
/

Multicentury-HTR-Demo

Sleeping

App Files Files Community

MikkoLipsanen committed 18 days ago

Commit

eee8423

verified ·

1 Parent(s): df891fc

Update code to use 202509_onnx_small model

Browse files

Files changed (1) hide show

onnx_text_recognition.py +341 -93

onnx_text_recognition.py CHANGED Viewed

@@ -1,115 +1,363 @@
-from optimum.onnxruntime import ORTModelForVision2Seq
 from transformers import TrOCRProcessor
 import numpy as np
 import onnxruntime
 import math
 import cv2
 import os
 class TextRecognition:
     def __init__(self,
-                processor_path,
-                model_path,
-                device = 'cuda:0',
-                half_precision = False,
-                line_threshold = 10):
-        self.device = device
-        self.half_precision = half_precision
-        self.line_threshold = line_threshold
-        self.processor_path = processor_path
         self.model_path = model_path
-        self.processor = self.init_processor()
-        self.recognition_model = self.init_recognition_model()
-    def init_processor(self):
-        """Function for initializing the processor."""
         try:
-            processor = TrOCRProcessor.from_pretrained(self.processor_path, token=True)
-            return processor
         except Exception as e:
-            print('Failed to initialize processor: %s' % e)
-    def init_recognition_model(self):
-        """Function for initializing the text detection model."""
         sess_options = onnxruntime.SessionOptions()
-        sess_options.intra_op_num_threads = 3
-        sess_options.inter_op_num_threads = 3
         try:
-            recognition_model = ORTModelForVision2Seq.from_pretrained(self.model_path, token=True, session_options=sess_options, provider="CUDAExecutionProvider")
-            return recognition_model
         except Exception as e:
-            print('Failed to load the text recognition model: %s' % e)
-    def crop_line(self, image, polygon, height, width):
-        """Crops predicted text line based on the polygon coordinates
-        and returns binarised text line image."""
-        polygon = np.array([[int(lst[0]), int(lst[1])] for lst in polygon], dtype=np.int32)
-        rect = cv2.boundingRect(polygon)
-        cropped_image = image[rect[1]: rect[1] + rect[3], rect[0]: rect[0] + rect[2]]
         mask = np.zeros([cropped_image.shape[0], cropped_image.shape[1]], dtype=np.uint8)
-        cv2.drawContours(mask, [polygon- np.array([[rect[0],rect[1]]])], -1, (255, 255, 255), -1, cv2.LINE_AA)
-        res = cv2.bitwise_and(cropped_image, cropped_image, mask = mask)
-        wbg = np.ones_like(cropped_image, np.uint8)*255
-        cv2.bitwise_not(wbg,wbg, mask=mask)
-        # Overlap the resulted cropped image on the white background
-        dst = wbg+res
-        return dst
-    def crop_lines(self, polygons, image, height, width):
-        """Returns a list of line images cropped following the detected polygon coordinates."""
         cropped_lines = []
         for i, polygon in enumerate(polygons):
-            cropped_line = self.crop_line(image, polygon, height, width)
-            cropped_lines.append(cropped_line)
         return cropped_lines
-    def get_scores(self, lgscores):
-        """Get exponent of log scores."""
-        scores = []
-        for lgscore in lgscores:
-            score = math.exp(lgscore)
-            scores.append(score)
-        return scores
-    def predict_text(self, cropped_lines):
-        """Functions for predicting text content from the cropped line images."""
-        pixel_values = self.processor(cropped_lines, return_tensors="pt").pixel_values
-        generated_dict = self.recognition_model.generate(pixel_values.to(self.device), max_new_tokens=128, return_dict_in_generate=True, output_scores=True)
-        generated_ids, lgscores = generated_dict['sequences'], generated_dict['sequences_scores']
-        scores = self.get_scores(lgscores.tolist())
-        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
-        return scores, generated_text
-    def get_text_lines(self, cropped_lines):
-        scores, generated_text = [], []
-        if len(cropped_lines) <= self.line_threshold:
-            scores, generated_text = self.predict_text(cropped_lines)
-        else:
-            n = math.ceil(len(cropped_lines) / self.line_threshold)
-            for i in range(n):
-                print(i)
-                start = int(i * self.line_threshold)
-                end = int(min(start + self.line_threshold, len(cropped_lines)))
-                sc, gt = self.predict_text(cropped_lines[start:end])
-                scores += sc
-                print(gt)
-                generated_text += gt
-        return scores, generated_text
-    def get_res_dict(self, polygons, generated_text, height, width, image_name, line_confs, scores):
-        """Combines the results in a dictionary form."""
-        line_dicts = []
-        for i in range(len(generated_text)):
-            line_dict = {'polygon': polygons[i], 'text': generated_text[i], 'conf': line_confs[i], 'text_conf':scores[i]}
-            line_dicts.append(line_dict)
-        lines_dict = {'img_name': image_name, 'height': height, 'width': width, 'text_lines': line_dicts}
-        return lines_dict
-    def process_lines(self, polygons, image, height, width):
-        # Crop line images
-        print('starting text generation')
-        cropped_lines = self.crop_lines(polygons, image, height, width)
-        print('cropped lines')
-        # Get text predictions
-        scores, generated_text = self.get_text_lines(cropped_lines)
         return generated_text

+from transformers import VisionEncoderDecoderConfig
+from typing import List, Tuple, Optional
 from transformers import TrOCRProcessor
+from pathlib import Path
 import numpy as np
 import onnxruntime
 import math
+import time
 import cv2
 import os
 class TextRecognition:
+    """
+    ONNX-based text recognition class using TrOCR for handwritten text recognition.
+    Processes text line images through an encoder-decoder architecture, supporting
+    batch processing and CUDA acceleration.
+    Args:
+        model_path: Path to the model directory containing ONNX models and config
+        device: Device identifier (default: 'cuda:0')
+        batch_size: Number of lines to process in parallel (default: 10)
+        img_height: Target height for input images (default: 192)
+        img_width: Target width for input images (default: 1024)
+        max_length: Maximum sequence length for generation (default: 128)
+    """
     def __init__(self,
+                model_path: str,
+                device: str = 'cuda:0',
+                batch_size: int = 10,
+                img_height: int = 192,
+                img_width: int = 1024,
+                max_length: int = 128):
         self.model_path = model_path
+        self.device = device
+        self.batch_size = batch_size
+        self.img_height = img_height
+        self.img_width = img_width
+        self.max_length = max_length
+        # Validate model path
+        if not os.path.exists(self.model_path):
+            raise FileNotFoundError(f"Model path does not exist: {model_path}")
+        self.init_processor()
+        self.init_recognition_model()
+    def init_processor(self) -> None:
+        """
+        Initialize the TrOCR processor with custom image dimensions.
+        Raises:
+            Exception: If processor initialization fails
+        """
         try:
+            self.processor = TrOCRProcessor.from_pretrained(
+                                                      str(self.model_path),
+                                                      use_fast=True,
+                                                      do_resize=True,
+                                                      size={
+                                                        'height': self.img_height,
+                                                        'width': self.img_width
+                                                      }
+                                                    )
+            print(f"✓ Processor loaded with custom image size: {self.img_height}x{self.img_width}")
         except Exception as e:
+            raise RuntimeError(f'Failed to initialize processor: {e}')
+    def init_recognition_model(self) -> None:
+        """
+        Initialize the ONNX encoder and decoder models with optimized settings.
+        Raises:
+            FileNotFoundError: If model files are not found
+            RuntimeError: If model loading fails
+        """
+        encoder_path = os.path.join(self.model_path, "encoder_model.onnx")
+        decoder_path = os.path.join(self.model_path, "decoder_model.onnx")
+        if not os.path.exists(encoder_path):
+            raise FileNotFoundError(f"Encoder model not found: {encoder_path}")
+        if not os.path.exists(decoder_path):
+            raise FileNotFoundError(f"Decoder model not found: {decoder_path}")
+        # Session options for better performance
         sess_options = onnxruntime.SessionOptions()
+        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        sess_options.intra_op_num_threads = 4
+        providers = [
+            'CUDAExecutionProvider',
+            'CPUExecutionProvider'
+        ]
+        # Load model config
+        self.config = VisionEncoderDecoderConfig.from_pretrained(str(self.model_path))
         try:
+            print("Loading encoder...")
+            self.encoder = onnxruntime.InferenceSession(
+                str(encoder_path),
+                sess_options=sess_options,
+                providers=providers
+            )
+            print("Loading decoder...")
+            self.decoder = onnxruntime.InferenceSession(
+                str(decoder_path),
+                sess_options=sess_options,
+                providers=providers
+            )
+            # Report which provider is actually being used
+            encoder_provider = self.encoder.get_providers()[0]
+            decoder_provider = self.decoder.get_providers()[0]
+            print(f"✓ Using execution provider: Encoder={encoder_provider}, Decoder={decoder_provider}")
         except Exception as e:
+            raise RuntimeError(f'Failed to load recognition models: {e}')
+    def crop_line(self, image: np.ndarray, polygon: List[List[float]]) -> Optional[np.ndarray]:
+        """
+        Crop a text line from an image based on polygon coordinates.
+        Creates a masked crop where the polygon area contains the original image
+        and the background is filled with white pixels.
+        Args:
+            image: Source image as numpy array
+            polygon: List of [x, y] coordinate pairs defining the text line region
+        Returns:
+            Cropped and masked text line image, or None if polygon is invalid
+        """
+        # Convert polygon to integer coordinates
+        polygon_array = np.array([[int(pt[0]), int(pt[1])] for pt in polygon], dtype=np.int32)
+        # Get bounding rectangle
+        rect = cv2.boundingRect(polygon_array)
+        x, y, w, h = rect
+        # Validate rectangle
+        if w <= 0 or h <= 0:
+            print(f"Warning: Invalid bounding rect dimensions: {w}x{h}")
+            return None
+        # Crop image to bounding rectangle
+        cropped_image = image[y:y + h, x:x + w]
+        if cropped_image.size == 0:
+            print(f"Warning: Empty cropped image at rect {rect}")
+            return None
+        # Create mask for the polygon region
         mask = np.zeros([cropped_image.shape[0], cropped_image.shape[1]], dtype=np.uint8)
+        # Adjust polygon coordinates relative to the cropped region
+        polygon_offset = polygon_array - np.array([[x, y]])
+        cv2.drawContours(mask, [polygon_offset], -1, (255, 255, 255), -1, cv2.LINE_AA)
+        # Extract the polygon region from the cropped image
+        masked_region = cv2.bitwise_and(cropped_image, cropped_image, mask=mask)
+        # Create white background
+        white_background = np.ones_like(cropped_image, np.uint8) * 255
+        cv2.bitwise_not(white_background, white_background, mask=mask)
+        # Overlay the masked region on white background
+        result = white_background + masked_region
+        return result
+    def crop_lines(self, polygons: List[List[List[float]]], image: np.ndarray) -> List[np.ndarray]:
+        """
+        Crop multiple text lines from an image.
+        Args:
+            polygons: List of polygon coordinate lists
+            image: Source image
+        Returns:
+            List of cropped text line images (excluding any failed crops)
+        """
         cropped_lines = []
         for i, polygon in enumerate(polygons):
+            cropped_line = self.crop_line(image, polygon)
+            if cropped_line is not None:
+                cropped_lines.append(cropped_line)
+            else:
+                print(f"Warning: Failed to crop line {i}")
         return cropped_lines
+    def encode(self, pixel_values: np.ndarray) -> np.ndarray:
+        """
+        Encode image pixel values into hidden states using the vision encoder.
+        Args:
+            pixel_values: Preprocessed image tensor from TrOCRProcessor
+                         Shape: (batch_size, channels, height, width)
+        Returns:
+            Encoder hidden states for input to the decoder
+            Shape: (batch_size, sequence_length, hidden_size)
+        Raises:
+            RuntimeError: If encoding fails
+        """
+        try:
+            encoder_outputs = self.encoder.run(
+                None,
+                {"pixel_values": pixel_values}
+            )[0]
+            return encoder_outputs
+        except Exception as e:
+            raise RuntimeError(f'Failed to encode input: {e}')
+    def generate(self, encoder_outputs: np.ndarray, batch_size: int) -> np.ndarray:
+        """
+        Generate text tokens using autoregressive decoding with early stopping.
+        Implements per-sequence early stopping: sequences that generate EOS tokens
+        stop producing new tokens while others continue, improving efficiency.
+        Args:
+            encoder_outputs: Hidden states from the encoder
+                            Shape: (batch_size, sequence_length, hidden_size)
+            batch_size: Number of sequences in the batch
+        Returns:
+            Generated token IDs including start and end tokens
+            Shape: (batch_size, generated_length)
+        Raises:
+            RuntimeError: If generation fails
+        """
+        try:
+            # Initialize decoder input with start tokens
+            decoder_input_ids = np.full(
+                (batch_size, 1),
+                self.config.decoder_start_token_id,
+                dtype=np.int64
+            )
+            # Track which sequences have finished
+            finished = np.zeros(batch_size, dtype=bool)
+            for step in range(self.max_length):
+                # Run decoder to get next token logits
+                decoder_outputs = self.decoder.run(
+                    None,
+                    {
+                        "input_ids": decoder_input_ids,
+                        "encoder_hidden_states": encoder_outputs
+                    }
+                )[0]
+                # Get most likely next token for each sequence
+                next_token_logits = decoder_outputs[:, -1, :]
+                next_tokens = np.argmax(next_token_logits, axis=-1)
+                # Check if any sequences just generated EOS token
+                just_finished = (next_tokens == self.config.eos_token_id)
+                finished = finished | just_finished
+                ## Replace tokens with PAD for already finished sequences
+                next_tokens[finished] = self.config.pad_token_id
+                # Append new tokens to the sequence
+                next_tokens = next_tokens.reshape(-1, 1)
+                decoder_input_ids = np.concatenate([decoder_input_ids, next_tokens], axis=1)
+                # Stop when all sequences have finished
+                if np.all(finished):
+                    break
+            return decoder_input_ids
+        except Exception as e:
+            raise RuntimeError(f'Failed to generate output ids: {e}')
+    def predict_text(self, cropped_lines: List[np.ndarray]) -> List[str]:
+        """
+        Predict text content from cropped line images.
+        Args:
+            cropped_lines: List of cropped text line images
+        Returns:
+            List of predicted text strings
+        Raises:
+            RuntimeError: If prediction fails
+        """
+        try:
+            # Process image with TrOCR processor
+            # Use 'pt' (PyTorch) then convert to numpy, as 'np' is not supported by fast processors
+            pixel_values = self.processor(cropped_lines, return_tensors="pt").pixel_values
+            pixel_values = pixel_values.numpy()
+            batch_size = pixel_values.shape[0]
+            #Encode images to hidden states
+            encoder_hidden_states = self.encode(pixel_values)
+            # Generate token sequences
+            generated_ids = self.generate(encoder_hidden_states, batch_size)
+            # Decode tokens to text
+            texts = self.processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False
+            )
+            return texts
+        except Exception as e:
+            raise RuntimeError(f'Failed to predict text: {e}')
+    def get_text_lines(self, cropped_lines: List[np.ndarray]) -> List[str]:
+        """
+        Process text lines in batches to manage memory efficiently.
+        Args:
+            cropped_lines: List of all cropped line images
+        Returns:
+            List of predicted text strings for all lines
+        """
+        generated_text = []
+        # Process in batches
+        for i in range(0, len(cropped_lines), self.batch_size):
+            batch = cropped_lines[i:i + self.batch_size]
+            texts = self.predict_text(batch)
+            generated_text.extend(texts)
         return generated_text
+    def process_lines(self,
+                     polygons: List[List[List[float]]],
+                     image: np.ndarray) -> List[str]:
+        """
+        Complete pipeline: crop text lines and predict their content.
+        Args:
+            polygons: List of polygon coordinate lists defining text line regions
+            image: Source document image
+        Returns:
+            List of predicted text strings for each valid line
+        """
+        # Crop line images from the document
+        cropped_lines = self.crop_lines(polygons, image)
+        if not cropped_lines:
+            print("Warning: No valid cropped lines to process")
+            return []
+        # Get text predictions for all lines
+        generated_text = self.get_text_lines(cropped_lines)
+        return generated_text