from torchvision.transforms import v2 as transforms_v2
from torchvision.io import read_image, ImageReadMode
import numpy as np
import torch
import cv2

def load_with_torchvision(img_path):
    """
    Load an image using torchvision and convert to numpy array.

    Args:
        img_path (str or Path): Path to the image file

    Returns:
        numpy.ndarray: Image array in RGB format with shape (H, W, C)
    """
    # Read as tensor
    img_tensor = read_image(str(img_path), mode=ImageReadMode.RGB)
    # Convert to numpy: (C, H, W) -> (H, W, C)
    img_np = img_tensor.permute(1, 2, 0).numpy()
    return img_np
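
# Usage sketch (the path below is a hypothetical example):
#
#   img = load_with_torchvision("data/page.jpg")
#   print(img.shape, img.dtype)  # (H, W, 3) uint8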

def preprocess_resize_torch_transform(image, max_size=1024, normalize=True):
    """
    Resize with torchvision.transforms.v2 so the longer side is at most
    max_size (pure torchvision, no PIL/OpenCV round-trip), optionally
    converting values to float32 in the [0, 1] range.

    Args:
        image: torch.Tensor (C, H, W), numpy.ndarray (H, W, C), or PIL Image
        max_size: maximum size for the longer dimension
        normalize: whether to convert to float32 scaled to [0, 1]

    Returns:
        torch.Tensor (C, H, W) or PIL Image; numpy inputs are returned
        as a torch.Tensor
    """
    # Convert numpy (H, W, C) input to a (C, H, W) tensor
    if isinstance(image, np.ndarray):
        image = torch.from_numpy(image)
        if image.ndim == 3 and image.shape[2] in (1, 3):
            image = image.permute(2, 0, 1)

    # Read (H, W) from the tensor shape, or from PIL's (W, H) size
    _, h, w = image.shape if isinstance(image, torch.Tensor) else (None, *image.size[::-1])

    # Build transform pipeline
    transform_list = []

    # Downscale only if the longer side exceeds max_size;
    # size=None with max_size caps the longer edge (v2-only behaviour)
    if h > max_size or w > max_size:
        transform_list.append(transforms_v2.Resize(size=None, max_size=max_size, antialias=True))

    # Convert to float32 and scale values to [0, 1]
    if normalize:
        transform_list.append(transforms_v2.ToDtype(torch.float32, scale=True))

    # Apply transforms
    if transform_list:
        transform = transforms_v2.Compose(transform_list)
        resized = transform(image)
    else:
        resized = image

    return resized
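
# Usage sketch, continuing from load_with_torchvision above; a numpy input
# comes back as a (C, H, W) tensor, float32 in [0, 1] when normalize=True:
#
#   img = load_with_torchvision("data/page.jpg")  # hypothetical path
#   small = preprocess_resize_torch_transform(img, max_size=1024)
#   print(small.shape, small.dtype)  # longer side <= 1024, torch.float32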

def upscale_mask_opencv(mask, bbox, upscaled_bbox_shape):
    """Upscale a binary (0/1) mask crop to the given (W, H) size using
    OpenCV nearest-neighbour resize; returns a 0/255 uint8 mask."""
    x1, y1, x2, y2 = map(int, bbox)
    cropped_mask = mask[y1:y2, x1:x2]
    mask_uint8 = cropped_mask.astype(np.uint8)
    # cv2.resize expects dsize as (width, height)
    upscaled = cv2.resize(mask_uint8,
                          upscaled_bbox_shape,
                          interpolation=cv2.INTER_NEAREST)

    # Map 0/1 mask values to 0/255 for use as an OpenCV mask
    return upscaled * 255
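
# Usage sketch: upscale a low-resolution line mask to the size of its
# upscaled bounding box. cv2.resize takes dsize as (width, height), so the
# target shape is built from the bbox extents, not from a numpy .shape:
#
#   x1, y1, x2, y2 = upscale_bbox(bbox, image.shape, mask.shape)  # see below
#   line_mask = upscale_mask_opencv(mask, bbox, (x2 - x1, y2 - y1))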

def upscale_bbox(bbox, original_shape, mask_shape):
    """
    Upscale bounding box coordinates from mask resolution to original image resolution.

    Parameters:
    -----------
    bbox : np.ndarray or list
        Bounding box coordinates in format [x_min, y_min, x_max, y_max]
        in the mask's coordinate system
    original_shape : tuple
        Original image shape (H, W) or (H, W, C) - e.g., (4545, 5527, 3)
    mask_shape : tuple
        Mask shape (H, W) - e.g., (631, 768)

    Returns:
    --------
    np.ndarray
        Upscaled bounding box as integer coordinates [x_min, y_min, x_max, y_max]
    """

    # Ensure bbox is a numpy array
    bbox = np.array(bbox)

    # Extract height and width from shapes
    original_h, original_w = original_shape[0], original_shape[1]
    mask_h, mask_w = mask_shape[0], mask_shape[1]

    # Calculate scale factors
    scale_x = original_w / mask_w  # Width scaling
    scale_y = original_h / mask_h  # Height scaling

    # Unpack bbox coordinates
    x_min, y_min, x_max, y_max = bbox

    # Scale coordinates
    x_min_scaled = x_min * scale_x
    y_min_scaled = y_min * scale_y
    x_max_scaled = x_max * scale_x
    y_max_scaled = y_max * scale_y

    # Clamp to the image bounds [0, original_w] x [0, original_h]
    x_min_scaled = max(x_min_scaled, 0)
    y_min_scaled = max(y_min_scaled, 0)
    x_max_scaled = min(x_max_scaled, original_w)
    y_max_scaled = min(y_max_scaled, original_h)

    # Round to the nearest integer (astype alone would truncate toward zero)
    bbox_scaled = np.array([
        x_min_scaled,
        y_min_scaled,
        x_max_scaled,
        y_max_scaled
    ]).round().astype(np.int32)

    return bbox_scaled
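
# Worked example with the shapes from the docstring and a hypothetical bbox:
# scale_x = 5527 / 768 ≈ 7.197 and scale_y = 4545 / 631 ≈ 7.203, so
#
#   upscale_bbox([100, 50, 300, 80], (4545, 5527, 3), (631, 768))
#   # -> array([ 720,  360, 2159,  576], dtype=int32)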

def crop_line(image, mask, upscaled_bbox):
    """Crop a predicted text line from the image using its upscaled
    bounding box and binary mask, and return the line pixels composited
    onto a white background."""
    x1, y1, x2, y2 = upscaled_bbox
    cropped_image = image[y1:y2, x1:x2, :]
    # Keep only the masked (text) pixels of the crop
    res = cv2.bitwise_and(cropped_image, cropped_image, mask=mask)
    # White background, zeroed where the mask is set
    wbg = np.ones_like(cropped_image, np.uint8) * 255
    cv2.bitwise_not(wbg, wbg, mask=mask)
    # Overlay the masked crop on the white background
    dst = wbg + res
    return dst
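
# Minimal end-to-end smoke test on synthetic data; the image/mask shapes
# mirror the upscale_bbox docstring example and the bbox is hypothetical.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    image = rng.integers(0, 256, size=(4545, 5527, 3), dtype=np.uint8)
    mask = np.zeros((631, 768), dtype=np.uint8)
    bbox = [100, 50, 300, 80]  # [x_min, y_min, x_max, y_max] on the mask grid
    mask[50:80, 100:300] = 1

    up_bbox = upscale_bbox(bbox, image.shape, mask.shape)
    x1, y1, x2, y2 = up_bbox
    line_mask = upscale_mask_opencv(mask, bbox, (x2 - x1, y2 - y1))
    line_img = crop_line(image, line_mask, up_bbox)
    print(up_bbox, line_img.shape)  # [ 720  360 2159  576] (216, 1439, 3)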