import os
import subprocess
from typing import Dict, List, Any

import torch
from transformers import AutoTokenizer, LayoutLMForSequenceClassification
| |
|
| |
|
| | os.system("apt install -y tesseract-ocr") |
| | os.system("pip3 install pytesseract==0.3.9") |
| |
|
| |
|
class EndpointHandler():
    """Inference-endpoint handler running LayoutLM sequence classification.

    NOTE(review): ``__call__`` ignores its ``data`` argument and classifies a
    hard-coded two-word example — presumably placeholder/demo code; confirm
    before deploying behind real traffic.
    """

    def __init__(self, path=""):
        # ``path`` is part of the endpoint-handler interface but unused here:
        # tokenizer and model are always pulled from the hub by name.
        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
        self.model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")

    def __call__(self, data: Any) -> Dict[str, List]:
        """Run one forward pass and return the classification logits.

        Parameters
        ----------
        data : Any
            Incoming request payload. Currently ignored (see class note).

        Returns
        -------
        dict
            ``{"logits": ...}`` where the value is ``logits.tolist()`` —
            a nested list of floats.
        """
        # Hard-coded demo input: two words with boxes already normalized to
        # the 0-1000 coordinate range LayoutLM expects.
        words = ["Hello", "world"]
        normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]

        # Repeat each word's box once per sub-word token so the bbox sequence
        # stays aligned with the tokenized input.
        token_boxes = []
        for word, box in zip(words, normalized_word_boxes):
            word_tokens = self.tokenizer.tokenize(word)
            token_boxes.extend([box] * len(word_tokens))

        # Dummy boxes for the [CLS] / [SEP] special tokens the tokenizer adds.
        token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

        encoding = self.tokenizer(" ".join(words), return_tensors="pt")
        bbox = torch.tensor([token_boxes])

        # Inference only: no ``labels`` (the original passed a dummy label and
        # computed a loss it never used) and no autograd graph.
        with torch.no_grad():
            outputs = self.model(
                input_ids=encoding["input_ids"],
                bbox=bbox,
                attention_mask=encoding["attention_mask"],
                token_type_ids=encoding["token_type_ids"],
            )

        return {"logits": outputs.logits.tolist()}
| | |