from torchvision.transforms import v2 as transforms_v2
from torchvision.io import read_image, ImageReadMode
import numpy as np
import torch
import cv2


def load_with_torchvision(img_path):
    """
    Load an image using torchvision and convert it to a numpy array.

    Args:
        img_path (str or Path): Path to the image file

    Returns:
        numpy.ndarray: Image array in RGB format with shape (H, W, C)
    """
    # Read as a tensor, forcing RGB regardless of the file's native format
    img_tensor = read_image(str(img_path), mode=ImageReadMode.RGB)
    # Convert to numpy: (C, H, W) -> (H, W, C)
    img_np = img_tensor.permute(1, 2, 0).numpy()
    return img_np


def preprocess_resize_torch_transform(image, max_size=1024, normalize=True):
    """
    Resize using torchvision.transforms.v2 (most concise, PyTorch only).

    Args:
        image: torch.Tensor (C, H, W), numpy.ndarray (H, W, C), or PIL Image
        max_size: maximum size for the longer dimension
        normalize: whether to scale values to the [0, 1] range

    Returns:
        torch.Tensor (C, H, W) for tensor/numpy input, PIL Image for PIL input
    """
    # Convert numpy input to a channels-first tensor
    if isinstance(image, np.ndarray):
        image = torch.from_numpy(image)
        if image.ndim == 3 and image.shape[2] in [1, 3]:
            image = image.permute(2, 0, 1)

    # Tensors are (C, H, W); PIL images report size as (W, H)
    _, h, w = image.shape if isinstance(image, torch.Tensor) else (None, *image.size[::-1])

    # Build transform pipeline
    transform_list = []

    # Resize only when the longer edge exceeds max_size
    # (size=None with max_size scales the longer edge down to max_size)
    if h > max_size or w > max_size:
        transform_list.append(transforms_v2.Resize(size=None, max_size=max_size, antialias=True))

    # Add normalization (uint8 [0, 255] -> float32 [0, 1])
    if normalize:
        transform_list.append(transforms_v2.ToDtype(torch.float32, scale=True))

    # Apply the transforms, if any
    if transform_list:
        transform = transforms_v2.Compose(transform_list)
        resized = transform(image)
    else:
        resized = image

    return resized


def upscale_mask_opencv(mask, bbox, upscaled_bbox_shape):
    """Upscale a binary {0, 1} mask crop using OpenCV nearest-neighbor resize.

    Note: upscaled_bbox_shape follows the cv2.resize convention, i.e. (width, height).
    """
    x1, y1, x2, y2 = map(int, bbox)
    cropped_mask = mask[y1:y2, x1:x2]
    mask_uint8 = cropped_mask.astype(np.uint8)
    upscaled = cv2.resize(mask_uint8, upscaled_bbox_shape, interpolation=cv2.INTER_NEAREST)
    # Map {0, 1} to {0, 255} so the result can be used directly as an OpenCV mask
    return upscaled * 255
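
# Minimal usage sketch for the loader and resizer above. The path "page.jpg"
# is a hypothetical placeholder; substitute a real image. Illustrates the
# round trip: (H, W, C) uint8 numpy from the loader -> (C, h, w) float32
# tensor in [0, 1] with the longer edge capped at max_size.
def _demo_load_and_resize(img_path="page.jpg"):
    img_np = load_with_torchvision(img_path)                    # (H, W, 3) uint8
    resized = preprocess_resize_torch_transform(img_np, max_size=1024)
    print(f"loaded {img_np.shape} -> resized {tuple(resized.shape)} {resized.dtype}")
    return resized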
def upscale_bbox(bbox, original_shape, mask_shape):
    """
    Upscale bounding box coordinates from mask resolution to original image resolution.

    Parameters
    ----------
    bbox : np.ndarray or list
        Bounding box in [x_min, y_min, x_max, y_max] format, in the mask's
        coordinate system
    original_shape : tuple
        Original image shape (H, W) or (H, W, C) - e.g., (4545, 5527, 3)
    mask_shape : tuple
        Mask shape (H, W) - e.g., (631, 768)

    Returns
    -------
    np.ndarray
        Upscaled bounding box as integer coordinates [x_min, y_min, x_max, y_max]
    """
    # Ensure bbox is a numpy array
    bbox = np.array(bbox, dtype=np.float64)

    # Extract height and width from shapes
    original_h, original_w = original_shape[0], original_shape[1]
    mask_h, mask_w = mask_shape[0], mask_shape[1]

    # Calculate scale factors
    scale_x = original_w / mask_w  # width scaling
    scale_y = original_h / mask_h  # height scaling

    # Scale coordinates
    x_min, y_min, x_max, y_max = bbox
    x_min_scaled = x_min * scale_x
    y_min_scaled = y_min * scale_y
    x_max_scaled = x_max * scale_x
    y_max_scaled = y_max * scale_y

    # Clamp to the original image bounds
    x_min_scaled = max(0, x_min_scaled)
    y_min_scaled = max(0, y_min_scaled)
    x_max_scaled = min(original_w, x_max_scaled)
    y_max_scaled = min(original_h, y_max_scaled)

    # Round to the nearest integer (astype alone would truncate)
    bbox_scaled = np.array([
        x_min_scaled, y_min_scaled, x_max_scaled, y_max_scaled
    ]).round().astype(np.int32)

    return bbox_scaled


def crop_line(image, mask, upscaled_bbox):
    """Crop the predicted text line given its upscaled bounding box and mask,
    and return the line image composited onto a white background."""
    x1, y1, x2, y2 = upscaled_bbox
    cropped_image = image[y1:y2, x1:x2, :]
    # Keep only the pixels covered by the line mask
    res = cv2.bitwise_and(cropped_image, cropped_image, mask=mask)
    # Build a white background and zero it out where the mask is set
    wbg = np.ones_like(cropped_image, np.uint8) * 255
    cv2.bitwise_not(wbg, wbg, mask=mask)
    # Overlay the masked crop on the white background
    dst = wbg + res
    return dst
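
# End-to-end sketch tying the helpers together: scale a mask-space bbox up to
# the original image, upscale the mask crop to match, and cut out a text line
# on a white background. The image, mask, and bbox here are synthetic
# stand-ins for a segmentation model's output, purely for illustration.
if __name__ == "__main__":
    image = np.full((400, 600, 3), 128, dtype=np.uint8)  # fake original image
    mask = np.zeros((100, 150), dtype=np.uint8)          # fake low-res mask
    mask[20:40, 30:90] = 1                               # one "text line" blob
    bbox = [30, 20, 90, 40]                              # bbox in mask coords

    up_bbox = upscale_bbox(bbox, image.shape, mask.shape)
    x1, y1, x2, y2 = up_bbox
    # cv2.resize expects (width, height), so pass the upscaled bbox extent that way
    line_mask = upscale_mask_opencv(mask, bbox, (x2 - x1, y2 - y1))
    line_img = crop_line(image, line_mask, up_bbox)
    print(f"bbox {bbox} -> {up_bbox.tolist()}, line image shape {line_img.shape}")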