Instructions for using NousResearch/OLMo-Bitnet-1B with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
  - Transformers
How to use NousResearch/OLMo-Bitnet-1B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="NousResearch/OLMo-Bitnet-1B", trust_remote_code=True)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("NousResearch/OLMo-Bitnet-1B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("NousResearch/OLMo-Bitnet-1B", trust_remote_code=True)
```
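Once the pipeline is loaded, a minimal generation call looks like this (the prompt and sampling settings below are illustrative assumptions, not part of the model card):

```python
# Minimal sketch: generate text with the pipeline loaded above.
# Prompt and sampling settings are illustrative, not prescriptive.
output = pipe("Language modeling is", max_new_tokens=50, do_sample=True, temperature=0.7)
print(output[0]["generated_text"])
```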
- Notebooks
  - Google Colab
  - Kaggle
- Local Apps
  - vLLM
How to use NousResearch/OLMo-Bitnet-1B with vLLM:
Install from pip and serve the model:
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "NousResearch/OLMo-Bitnet-1B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "NousResearch/OLMo-Bitnet-1B",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
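Since the server exposes an OpenAI-compatible API, you can also call it with the official `openai` Python client instead of curl. A minimal sketch, assuming the default port 8000 from the command above (the `api_key` is a dummy value; the local server does not check it by default):

```python
# Minimal sketch: query the local vLLM server via its OpenAI-compatible API.
from openai import OpenAI

# base_url points at the vLLM server started above; api_key is a placeholder.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="NousResearch/OLMo-Bitnet-1B",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```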
Use Docker

```bash
docker model run hf.co/NousResearch/OLMo-Bitnet-1B
```
  - SGLang
How to use NousResearch/OLMo-Bitnet-1B with SGLang:
Install from pip and serve the model:
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "NousResearch/OLMo-Bitnet-1B" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "NousResearch/OLMo-Bitnet-1B",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
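SGLang serves the same OpenAI-compatible completions endpoint, so the Python client sketch shown in the vLLM section works here as well; just point `base_url` at `http://localhost:30000/v1` instead.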
Use Docker images

```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "NousResearch/OLMo-Bitnet-1B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "NousResearch/OLMo-Bitnet-1B",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
  - Docker Model Runner

How to use NousResearch/OLMo-Bitnet-1B with Docker Model Runner:
```bash
docker model run hf.co/NousResearch/OLMo-Bitnet-1B
```
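This assumes Docker Model Runner is enabled in your Docker installation; the command pulls the model from the `hf.co` reference and starts an interactive session against it.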
The repository ships custom modeling code (loaded when `trust_remote_code=True` is passed) that wraps the native OLMo implementation in the Hugging Face `PreTrainedModel` interface:

````python
from dataclasses import fields
from typing import List, Optional, Tuple, Union
import math

import torch
import torch.nn.functional as F
from transformers import PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
from transformers.models.auto import AutoModelForCausalLM

from .config import ModelConfig
from .model import OLMo
from .configuration_olmo import OLMoConfig


def create_model_config_from_pretrained_config(config: OLMoConfig):
    """
    Utility function: copy the fields of a Hugging Face `OLMoConfig`
    into the native OLMo `ModelConfig`.
    """
    kwargs = {}
    for field in fields(ModelConfig):
        kwargs[field.name] = getattr(config, field.name)
    model_config = ModelConfig(**kwargs)
    return model_config


class OLMoPreTrainedModel(PreTrainedModel):
    config_class = OLMoConfig
    base_model_prefix = "model"
    _no_split_modules = ["OLMoBlock"]
    # _skip_keys_device_placement = ["past_key_values", "causal_mask"]
    _skip_keys_device_placement = ["past_key_values"]

    def _init_weights(self, module):
        # `OLMo.reset_parameters` initializes weights of itself and its children
        if isinstance(module, OLMo):
            module.reset_parameters()


class OLMoForCausalLM(OLMoPreTrainedModel):
    _tied_weights_keys = []
    # _tied_weights_keys = ["transformer.wte.weight"]

    def __init__(self, config: OLMoConfig):
        super().__init__(config)
        self.model = OLMo(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> torch.nn.Module:
        return self.model.transformer.wte

    def set_input_embeddings(self, value: torch.nn.Module):
        self.model.transformer.wte = value

    def get_output_embeddings(self):
        if self.config.weight_tying:
            return self.model.transformer.wte
        else:
            return self.model.transformer.ff_out

    def set_output_embeddings(self, value: torch.nn.Module):
        if self.config.weight_tying:
            self.model.transformer.wte = value
        else:
            self.model.transformer.ff_out = value

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_bias: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the causal language modeling loss. Indices should either be in
                `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices
                set to `-100` are ignored (masked); the loss is only computed for the tokens with
                labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, OLMoForCausalLM

        >>> model = OLMoForCausalLM.from_pretrained("allenai/OLMo-7B")
        >>> tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions or self.config.output_attentions
        output_hidden_states = output_hidden_states or self.config.output_hidden_states
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        assert not output_attentions

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        base_output: Union[BaseModelOutputWithPast, Tuple] = self.model.forward(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            attention_bias=attention_bias,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = base_output.last_hidden_state if return_dict else base_output[0]

        # Get logits.
        # shape: (batch_size, seq_len or 1, vocab_size)
        if self.config.weight_tying:
            logits = F.linear(last_hidden_state, self.model.transformer.wte.weight, None)  # type: ignore
        else:
            logits = self.model.transformer.ff_out(last_hidden_state)  # type: ignore
        if self.config.scale_logits:
            logits.mul_(1 / math.sqrt(self.config.d_model))

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = torch.nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + base_output[1:]
            return (loss,) + output if loss is not None else output

        assert isinstance(base_output, BaseModelOutputWithPast)
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=base_output.past_key_values,
            hidden_states=base_output.hidden_states,
            attentions=base_output.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
    ):
        if past_key_values:
            # This is because we want the model to only process the last generated token.
            input_ids = input_ids[:, -1:]
        model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}

        # `cache_position` is supplied by newer `generate()` implementations but is not
        # used by this model's forward pass, so drop it if present.
        kwargs.pop("cache_position", None)
        model_inputs.update(kwargs)
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # Reorder the KV cache to match the selected beams during beam search.
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


# Register the model so that it is available for transformer pipelines, auto-loading, etc.
AutoModelForCausalLM.register(OLMoConfig, OLMoForCausalLM)
````
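For reference, a minimal sketch of exercising this wrapper end to end: the `AutoModelForCausalLM.register(...)` call above is what lets the checkpoint load through the Auto classes with `trust_remote_code=True`, and passing `labels` triggers the shifted cross-entropy loss computed in `forward` (the prompt below is illustrative):

```python
# Minimal sketch: load the registered model and compute the LM loss.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("NousResearch/OLMo-Bitnet-1B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("NousResearch/OLMo-Bitnet-1B", trust_remote_code=True)

inputs = tokenizer("Language modeling is", return_tensors="pt")

# labels=input_ids makes forward() compute the shifted cross-entropy loss
# (tokens < n predict n), exactly as in the snippet above.
with torch.no_grad():
    out = model(input_ids=inputs.input_ids, labels=inputs.input_ids)

print(out.loss)          # scalar cross-entropy loss
print(out.logits.shape)  # (batch_size, seq_len, vocab_size)
```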