Token Classification
Transformers
Safetensors
English
modernbert
fill-mask
orality
linguistics
multi-label
custom_code
Instructions to use HavelockAI/bert-token-classifier with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use HavelockAI/bert-token-classifier with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="HavelockAI/bert-token-classifier", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("HavelockAI/bert-token-classifier", trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained("HavelockAI/bert-token-classifier", trust_remote_code=True)
```
- Notebooks
- Google Colab
- Kaggle
| """Custom multi-label token classifier — backbone-agnostic.""" | |
| import torch | |
| import torch.nn as nn | |
| from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel | |
class MultiLabelCRF(nn.Module):
    """Independent CRF per marker type for multi-label BIO tagging.

    Each marker type owns a 3x3 transition matrix (indexed ``[prev, next]``)
    plus start and end scores over three tags. Tag index 2 can never start a
    sequence nor follow tag index 0; this matches a BIO scheme with the
    order O=0, B=1, I=2 (NOTE(review): tag order inferred from those
    constraints — confirm against the training labels).
    """

    def __init__(self, num_types: int) -> None:
        super().__init__()
        self.num_types = num_types
        self.transitions = nn.Parameter(torch.empty(num_types, 3, 3))
        self.start_transitions = nn.Parameter(torch.empty(num_types, 3))
        self.end_transitions = nn.Parameter(torch.empty(num_types, 3))
        # Placeholder — will be overwritten by loaded weights if present
        self.register_buffer("emission_bias", torch.zeros(1, 1, 1, 3))
        self._reset_parameters()

    def _reset_parameters(self) -> None:
        """Initialise scores with small noise and hard-forbid invalid moves."""
        nn.init.uniform_(self.transitions, -0.1, 0.1)
        nn.init.uniform_(self.start_transitions, -0.1, 0.1)
        nn.init.uniform_(self.end_transitions, -0.1, 0.1)
        with torch.no_grad():
            # Large negative scores make these moves effectively impossible:
            # tag 0 -> tag 2, and starting a sequence in tag 2.
            self.transitions[:, 0, 2] = -10000.0
            self.start_transitions[:, 2] = -10000.0

    def _apply_emission_bias(self, emissions: torch.Tensor) -> torch.Tensor:
        """Add the per-tag bias (broadcast over batch, seq and type) if set."""
        if self.emission_bias is not None:
            return emissions + self.emission_bias
        return emissions

    def decode(self, emissions: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Viterbi decoding.

        Args:
            emissions: (batch, seq, num_types, 3)
            mask: (batch, seq); truthy entries mark real tokens. The valid
                tokens of each row are expected to form a prefix (right
                padding), since only the count of valid tokens is used to
                locate the last position.

        Returns: (batch, seq, num_types) best tag sequences. Padded positions
            and rows without any valid token are filled with tag 0.
        """
        # Apply emission bias before decoding
        emissions = self._apply_emission_bias(emissions)
        batch, seq, num_types, _ = emissions.shape

        # Nothing to decode; indexing the first time step below would fail.
        if seq == 0:
            return torch.zeros(
                batch, 0, num_types, dtype=torch.long, device=emissions.device
            )

        # Accept integer attention masks as well as boolean ones.
        mask = mask.bool()

        BT = batch * num_types
        # Fold the type axis into the batch axis: (batch*num_types, seq, 3)
        em = emissions.permute(0, 2, 1, 3).reshape(BT, seq, 3)
        mk = mask.unsqueeze(1).expand(-1, num_types, -1).reshape(BT, seq)

        # Expand params across batch (row b*num_types + t uses type t)
        trans = (
            self.transitions.unsqueeze(0).expand(batch, -1, -1, -1).reshape(BT, 3, 3)
        )
        start = self.start_transitions.unsqueeze(0).expand(batch, -1, -1).reshape(BT, 3)
        end = self.end_transitions.unsqueeze(0).expand(batch, -1, -1).reshape(BT, 3)

        arange = torch.arange(BT, device=em.device)
        score = start + em[:, 0]
        history: list[torch.Tensor] = []

        for i in range(1, seq):
            # broadcast[b, prev, next] = score[b, prev] + trans[b, prev, next]
            #                            + em[b, i, next]
            broadcast = score.unsqueeze(2) + trans + em[:, i].unsqueeze(1)
            best_score, best_prev = broadcast.max(dim=1)
            # Padded steps leave the running score untouched.
            score = torch.where(mk[:, i].unsqueeze(1), best_score, score)
            history.append(best_prev)

        score = score + end
        _, best_last = score.max(dim=1)

        best_paths = torch.zeros(BT, seq, dtype=torch.long, device=em.device)
        seq_lengths = mk.sum(dim=1).long()
        # Clamp so a row with no valid token does not index position -1,
        # which would silently write into the last (padded) position.
        last_index = (seq_lengths - 1).clamp(min=0)
        best_paths[arange, last_index] = best_last

        for i in range(seq - 2, -1, -1):
            # history[i] holds, for step i + 1, the best previous tag per tag.
            prev_tag = history[i][arange, best_paths[:, i + 1]]
            should_update = i < last_index
            best_paths[:, i] = torch.where(should_update, prev_tag, best_paths[:, i])

        # Rows without any valid token carry no meaningful path.
        best_paths.masked_fill_((seq_lengths == 0).unsqueeze(1), 0)

        return best_paths.reshape(batch, num_types, seq).permute(0, 2, 1)
class HavelockTokenConfig(PretrainedConfig):
    """Backbone configuration extended with the classifier's own fields.

    Args:
        num_types: number of marker types the model tags independently.
        use_crf: whether the model builds a CRF layer for decoding.
        **kwargs: every other setting, forwarded untouched to
            ``PretrainedConfig`` (typically the backbone's config values).
    """

    model_type = "havelock_token_classifier"

    def __init__(self, num_types: int = 1, use_crf: bool = False, **kwargs):
        # Let the base class consume everything that is not one of ours.
        super().__init__(**kwargs)
        self.num_types, self.use_crf = num_types, use_crf
class HavelockTokenClassifier(PreTrainedModel):
    """Encoder backbone with a linear head scoring 3 tags per marker type.

    The head emits ``num_types * 3`` scores per token, reshaped to
    ``(batch, seq, num_types, 3)``. When ``config.use_crf`` is set, a
    ``MultiLabelCRF`` is attached and used by :meth:`decode`.
    """

    config_class = HavelockTokenConfig

    def __init__(
        self, config: HavelockTokenConfig, backbone: PreTrainedModel | None = None
    ):
        """Build the model.

        Args:
            config: model configuration; must expose ``hidden_size``,
                ``num_types`` and ``use_crf``.
            backbone: an already constructed encoder to reuse. When omitted,
                the encoder is built from ``config``.
        """
        super().__init__(config)
        self.num_types = config.num_types
        self.use_crf = config.use_crf

        # Accept injected backbone (from_pretrained path) or build from config
        if backbone is not None:
            self.bert = backbone
        else:
            # NOTE(review): this needs AutoModel to resolve a backbone class
            # for this config object — confirm it works for the config
            # that is actually saved with the checkpoint.
            self.bert = AutoModel.from_config(config)

        # Falls back to 0.1 when the config does not define a dropout rate.
        self.dropout = nn.Dropout(getattr(config, "hidden_dropout_prob", 0.1))
        self.classifier = nn.Linear(config.hidden_size, config.num_types * 3)

        if self.use_crf:
            self.crf = MultiLabelCRF(config.num_types)

        self.post_init()

    @classmethod
    def from_backbone(
        cls,
        model_name: str,
        num_types: int,
        use_crf: bool = False,
        obi_bias: torch.Tensor | None = None,
    ) -> "HavelockTokenClassifier":
        """Build from a pretrained backbone name — the training entrypoint.

        Args:
            model_name: identifier passed to ``AutoModel.from_pretrained``.
            num_types: number of marker types to tag.
            use_crf: attach a CRF layer for decoding.
            obi_bias: optional tensor with 3 elements used as the CRF
                emission bias; ignored when ``use_crf`` is False.

        Returns:
            A model wrapping the loaded backbone, with a fresh head.
        """
        backbone = AutoModel.from_pretrained(model_name)
        backbone_config = backbone.config

        # Carry every backbone setting over and add our custom fields.
        config = HavelockTokenConfig(
            num_types=num_types,
            use_crf=use_crf,
            **backbone_config.to_dict(),
        )

        model = cls(config, backbone=backbone)

        if use_crf and obi_bias is not None:
            # Match the registered buffer's dtype/device so later additions
            # to the emissions neither promote dtype nor cross devices.
            bias = obi_bias.detach().reshape(1, 1, 1, 3)
            model.crf.emission_bias = bias.to(model.crf.emission_bias)

        return model

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs,
    ) -> torch.Tensor:
        """Return tag scores of shape ``(batch, seq, num_types, 3)``.

        Extra keyword arguments are accepted but not used.
        """
        hidden = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        hidden = self.dropout(hidden)
        logits = self.classifier(hidden)
        batch, seq, _ = logits.shape
        return logits.view(batch, seq, self.num_types, 3)

    def decode(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Return the predicted tag index per token and marker type.

        Uses CRF Viterbi decoding when the model was built with a CRF,
        otherwise the arg-max over the three tag scores. The result has
        shape ``(batch, seq, num_types)``.
        """
        # Call the module rather than ``forward`` so registered hooks run.
        logits = self(input_ids, attention_mask=attention_mask)

        if not self.use_crf:
            return logits.argmax(dim=-1)

        if attention_mask is not None:
            mask = attention_mask.bool()
        else:
            # No mask given: treat every position as a real token.
            mask = torch.ones(
                logits.shape[:2], dtype=torch.bool, device=logits.device
            )
        return self.crf.decode(logits, mask)