Latvian named entity recognition (NER)
Dataset
Trained on the FullStack dataset.
Results
Results on the test split:
| Label | Precision | Recall | F1 Score |
|---|---|---|---|
| Micro Avg | 87.2 | 87.9 | 87.6 |
| Macro Avg | 76.6 | 73.1 | 73.8 |
| GPE | 93.2 | 93.2 | 93.2 |
| entity | 50.0 | 55.2 | 52.5 |
| event | 72.0 | 81.8 | 76.6 |
| location | 81.5 | 78.6 | 80.0 |
| money | 60.0 | 25.0 | 35.3 |
| organization | 87.2 | 89.2 | 88.2 |
| person | 96.5 | 98.4 | 97.4 |
| product | 75.0 | 58.1 | 65.5 |
| time | 73.8 | 78.3 | 75.9 |
Usage
import re
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
class NER:
    """Named-entity tagger built on a Hugging Face token-classification model.

    The input text is split into word and punctuation tokens with a regular
    expression, every word is labelled from the prediction of its first
    sub-token, and consecutive words that continue the same label are merged
    into character-offset spans.
    """

    # One match per run of word characters, or per single non-space symbol.
    _WORD_RE = re.compile(r'\w+|\S')

    def __init__(self, model_name='AiLab-IMCS-UL/lv-ner-v1', max_length=1024):
        """Load the tokenizer and the model and switch the model to eval mode.

        Args:
            model_name: Identifier handed to both ``from_pretrained`` calls.
            max_length: Upper bound on the number of tokens (special tokens
                included) that is passed to the tokenizer for one forward pass.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name).eval()
        self.id2label = self.model.config.id2label
        self.max_length = max_length

    def predict(self, text: str) -> list:
        """Return the entities found in ``text``.

        Args:
            text: Raw input string.

        Returns:
            A list of dicts with keys ``start``, ``end`` (character offsets
            into ``text``), ``label`` and ``text`` (the covered substring),
            in order of appearance. Empty when ``text`` holds no tokens.
        """
        matches = list(self._WORD_RE.finditer(text))
        if not matches:
            return []
        tagged = self._tag_words([m.group(0) for m in matches])
        ents = self._merge(tagged, matches)
        for ent in ents:
            ent['text'] = text[ent['start']:ent['end']]
        return ents

    def _tag_words(self, words):
        """Label every word, running the model over successive windows.

        A single truncated pass would silently ignore all words past the
        first ``max_length`` tokens, so the remaining words are fed again
        until every word is labelled. Text that fits into one pass needs
        exactly one model call.

        Returns:
            A list of ``(word_index, raw_label)`` pairs in text order.
        """
        tagged = []
        pos = 0
        while pos < len(words):
            enc = self.tokenizer(
                words[pos:],
                is_split_into_words=True,
                return_tensors='pt',
                truncation=True,
                max_length=self.max_length,
            )
            word_ids = enc.word_ids(0)
            with torch.no_grad():
                preds = self.model(**enc).logits.argmax(-1)[0].tolist()
            seen = []
            prev = None
            for pred, wid in zip(preds, word_ids):
                # Only the first sub-token of a word decides its label;
                # special tokens carry no word id at all.
                if wid is not None and wid != prev:
                    seen.append((pos + wid, self.id2label[pred]))
                prev = wid
            if not seen:
                # The window has no room for any word (or the remaining words
                # yield no tokens): stop rather than loop forever.
                break
            tagged.extend(seen)
            pos = seen[-1][0] + 1
        return tagged

    @staticmethod
    def _merge(tagged, matches):
        """Fold per-word labels into entity spans with character offsets."""
        ents, cur = [], None
        for idx, raw_label in tagged:
            if raw_label == 'O':
                if cur:
                    ents.append(cur)
                    cur = None
                continue
            # A label without a prefix is treated as the start of an entity.
            prefix, label = raw_label.split('-', 1) if '-' in raw_label else ('B', raw_label)
            if prefix == 'B' or not cur or cur['label'] != label:
                if cur:
                    ents.append(cur)
                cur = {'start': matches[idx].start(), 'end': matches[idx].end(), 'label': label}
            else:
                cur['end'] = matches[idx].end()
        if cur:
            ents.append(cur)
        return ents
# Build the tagger with its default arguments and tag one Latvian sentence.
m = NER()
sentence = 'Jānis Bērziņš strādā Latvijas uzņēmumā SIA Mia.'
print(m.predict(sentence))
- Downloads last month
- 51
Model tree for AiLab-IMCS-UL/lv-ner-v1
Base model
AiLab-IMCS-UL/lv-deberta-base