| import { logger } from '@librechat/data-schemas'; |
| import { encoding_for_model as encodingForModel, get_encoding as getEncoding } from 'tiktoken'; |
| import type { Tiktoken, TiktokenModel, TiktokenEncoding } from 'tiktoken'; |
|
|
/**
 * Configuration for {@link Tokenizer}.
 */
interface TokenizerOptions {
  /** When true, log a debug message each time the encoder cache is reset. */
  debug?: boolean;
}
|
|
| class Tokenizer { |
| tokenizersCache: Record<string, Tiktoken>; |
| tokenizerCallsCount: number; |
| private options?: TokenizerOptions; |
|
|
| constructor() { |
| this.tokenizersCache = {}; |
| this.tokenizerCallsCount = 0; |
| } |
|
|
| getTokenizer( |
| encoding: TiktokenModel | TiktokenEncoding, |
| isModelName = false, |
| extendSpecialTokens: Record<string, number> = {}, |
| ): Tiktoken { |
| let tokenizer: Tiktoken; |
| if (this.tokenizersCache[encoding]) { |
| tokenizer = this.tokenizersCache[encoding]; |
| } else { |
| if (isModelName) { |
| tokenizer = encodingForModel(encoding as TiktokenModel, extendSpecialTokens); |
| } else { |
| tokenizer = getEncoding(encoding as TiktokenEncoding, extendSpecialTokens); |
| } |
| this.tokenizersCache[encoding] = tokenizer; |
| } |
| return tokenizer; |
| } |
|
|
| freeAndResetAllEncoders(): void { |
| try { |
| Object.keys(this.tokenizersCache).forEach((key) => { |
| if (this.tokenizersCache[key]) { |
| this.tokenizersCache[key].free(); |
| delete this.tokenizersCache[key]; |
| } |
| }); |
| this.tokenizerCallsCount = 1; |
| } catch (error) { |
| logger.error('[Tokenizer] Free and reset encoders error', error); |
| } |
| } |
|
|
| resetTokenizersIfNecessary(): void { |
| if (this.tokenizerCallsCount >= 25) { |
| if (this.options?.debug) { |
| logger.debug('[Tokenizer] freeAndResetAllEncoders: reached 25 encodings, resetting...'); |
| } |
| this.freeAndResetAllEncoders(); |
| } |
| this.tokenizerCallsCount++; |
| } |
|
|
| getTokenCount(text: string, encoding: TiktokenModel | TiktokenEncoding = 'cl100k_base'): number { |
| this.resetTokenizersIfNecessary(); |
| try { |
| const tokenizer = this.getTokenizer(encoding); |
| return tokenizer.encode(text, 'all').length; |
| } catch (error) { |
| logger.error('[Tokenizer] Error getting token count:', error); |
| this.freeAndResetAllEncoders(); |
| const tokenizer = this.getTokenizer(encoding); |
| return tokenizer.encode(text, 'all').length; |
| } |
| } |
| } |
|
|
| const TokenizerSingleton = new Tokenizer(); |
|
|
| export async function countTokens(text = ''): Promise<number> { |
| return TokenizerSingleton.getTokenCount(text, 'cl100k_base'); |
| } |
|
|
| export default TokenizerSingleton; |
|
|