from torchvision.transforms import v2 as transforms_v2
from torchvision.io import read_image, ImageReadMode
import numpy as np
import torch
import cv2
def load_with_torchvision(img_path):
    """
    Load an image using torchvision and convert to a numpy array.

    Args:
        img_path (str or Path): Path to the image file

    Returns:
        numpy.ndarray: Image array in RGB format with shape (H, W, C)
    """
    # Read as tensor
    img_tensor = read_image(str(img_path), mode=ImageReadMode.RGB)
    # Convert to numpy: (C, H, W) -> (H, W, C)
    img_np = img_tensor.permute(1, 2, 0).numpy()
    return img_np
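
# Usage sketch ("page.jpg" is a hypothetical example path, not part of this repo):
#   img = load_with_torchvision("page.jpg")
#   print(img.shape, img.dtype)  # -> (H, W, 3), uint8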
def preprocess_resize_torch_transform(image, max_size=1024, normalize=True):
    """
    Resize using torchvision.transforms.v2 (most concise, PyTorch only).

    Args:
        image: torch.Tensor (C, H, W), numpy.ndarray (H, W, C), or PIL Image
        max_size: maximum size for the longer dimension
        normalize: whether to normalize to [0, 1] range

    Returns:
        torch.Tensor (C, H, W) or PIL Image, matching the input;
        numpy inputs are converted and returned as torch.Tensor (C, H, W)
    """
    # Convert numpy input to a (C, H, W) tensor
    if isinstance(image, np.ndarray):
        image = torch.from_numpy(image)
        if image.ndim == 3 and image.shape[2] in [1, 3]:
            image = image.permute(2, 0, 1)

    # Tensors carry (C, H, W); PIL images report size as (W, H)
    c, h, w = image.shape if isinstance(image, torch.Tensor) else (None, *image.size[::-1])

    # Build transform pipeline
    transform_list = []

    # Resize only if the longer edge exceeds max_size
    if h > max_size or w > max_size:
        transform_list.append(transforms_v2.Resize(size=None, max_size=max_size, antialias=True))

    # Normalize to float32 in [0, 1]
    if normalize:
        transform_list.append(transforms_v2.ToDtype(torch.float32, scale=True))

    # Apply transforms
    if transform_list:
        transform = transforms_v2.Compose(transform_list)
        resized = transform(image)
    else:
        resized = image
    return resized
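
# Usage sketch with a synthetic image: a (H, W, 3) uint8 numpy array comes back
# as a float32 (3, h, w) tensor with the longer edge capped at max_size.
#   img = np.zeros((2000, 3000, 3), dtype=np.uint8)
#   out = preprocess_resize_torch_transform(img, max_size=1024)
#   print(out.shape, out.dtype)  # -> torch.Size([3, 683, 1024]) torch.float32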
def upscale_mask_opencv(mask, bbox, upscaled_bbox_shape):
    """Upscale a mask crop using OpenCV resize with nearest-neighbor interpolation.

    Note: upscaled_bbox_shape follows the cv2.resize convention of (width, height).
    """
    x1, y1, x2, y2 = map(int, bbox)
    cropped_mask = mask[y1:y2, x1:x2]
    mask_uint8 = cropped_mask.astype(np.uint8)
    upscaled = cv2.resize(mask_uint8,
                          upscaled_bbox_shape,
                          interpolation=cv2.INTER_NEAREST)
    # Assumes a {0, 1} mask; scale to {0, 255} for use with cv2 bitwise ops
    return upscaled * 255
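
# Usage sketch; note the target shape follows the cv2.resize (width, height) convention:
#   low_res = np.zeros((631, 768), dtype=np.uint8)
#   low_res[10:30, 5:100] = 1
#   big = upscale_mask_opencv(low_res, [5, 10, 100, 30], (760, 160))
#   print(big.shape, big.max())  # -> (160, 760) 255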
def upscale_bbox(bbox, original_shape, mask_shape):
    """
    Upscale bounding box coordinates from mask resolution to original image resolution.

    Parameters:
    -----------
    bbox : np.ndarray or list
        Bounding box coordinates in format [x_min, y_min, x_max, y_max]
        in the mask's coordinate system
    original_shape : tuple
        Original image shape (H, W) or (H, W, C) - e.g., (4545, 5527, 3)
    mask_shape : tuple
        Mask shape (H, W) - e.g., (631, 768)

    Returns:
    --------
    np.ndarray
        Upscaled bounding box as integer coordinates [x_min, y_min, x_max, y_max]
    """
    # Ensure bbox is a numpy array
    bbox = np.array(bbox)

    # Extract height and width from shapes
    original_h, original_w = original_shape[0], original_shape[1]
    mask_h, mask_w = mask_shape[0], mask_shape[1]

    # Calculate scale factors
    scale_x = original_w / mask_w  # Width scaling
    scale_y = original_h / mask_h  # Height scaling

    # Unpack and scale bbox coordinates
    x_min, y_min, x_max, y_max = bbox
    x_min_scaled = x_min * scale_x
    y_min_scaled = y_min * scale_y
    x_max_scaled = x_max * scale_x
    y_max_scaled = y_max * scale_y

    # Clamp to the original image bounds
    x_min_scaled = max(x_min_scaled, 0)
    y_min_scaled = max(y_min_scaled, 0)
    x_max_scaled = min(x_max_scaled, original_w)
    y_max_scaled = min(y_max_scaled, original_h)

    # Round to nearest integer (astype alone would truncate)
    bbox_scaled = np.round([
        x_min_scaled,
        y_min_scaled,
        x_max_scaled,
        y_max_scaled
    ]).astype(np.int32)
    return bbox_scaled
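
# Usage sketch, reusing the example shapes from the docstring:
#   up = upscale_bbox([50, 100, 700, 140], (4545, 5527, 3), (631, 768))
#   print(up)  # -> roughly [360, 720, 5038, 1008]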
def crop_line(image, mask, upscaledbbox):
    """Crops a predicted text line using the upscaled bounding box and mask,
    and returns the binarised text line image on a white background."""
    x1, y1, x2, y2 = upscaledbbox
    cropped_image = image[y1:y2, x1:x2, :]
    # Keep only the masked pixels of the crop (mask must be uint8 and match the crop's shape)
    res = cv2.bitwise_and(cropped_image, cropped_image, mask=mask)
    # White background, zeroed where the mask is set
    wbg = np.ones_like(cropped_image, np.uint8) * 255
    cv2.bitwise_not(wbg, wbg, mask=mask)
    # Overlay the resulting cropped image on the white background
    dst = wbg + res
    return dst
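
# Minimal end-to-end sketch tying the helpers together. The mask and bbox below
# are synthetic stand-ins for a segmentation model's low-resolution output, and
# "page.jpg" is a hypothetical file; neither comes from this repo.
if __name__ == "__main__":
    mask = np.zeros((631, 768), dtype=np.uint8)  # low-res line mask
    mask[100:140, 50:700] = 1                    # one fake text line
    bbox = np.array([50, 100, 700, 140])         # its bbox in mask coordinates

    original_shape = (4545, 5527, 3)             # full-resolution page size
    up_bbox = upscale_bbox(bbox, original_shape, mask.shape)
    w, h = int(up_bbox[2] - up_bbox[0]), int(up_bbox[3] - up_bbox[1])
    up_mask = upscale_mask_opencv(mask, bbox, (w, h))  # dsize is (W, H)
    print(up_bbox, up_mask.shape)

    # With a real page image at original_shape, the line crop would be:
    #   page = load_with_torchvision("page.jpg")
    #   line = crop_line(page, up_mask, up_bbox)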