@@ -65,49 +65,50 @@ class TOKENIZER_TYPE(IntEnum):
65
65
66
66
# TODO: add models here, base models preferred
67
67
models = [
68
- {"name" : "llama-spm" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" , },
69
- {"name" : "llama-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" , },
70
- {"name" : "phi-3" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" , },
71
- {"name" : "deepseek-llm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" , },
72
- {"name" : "deepseek-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" , },
73
- {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
74
- {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
75
- {"name" : "falcon3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon3-7B-Base" , },
76
- {"name" : "bert-bge-large" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/BAAI/bge-large-zh-v1.5" , },
77
- {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
78
- {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
79
- {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
80
- {"name" : "stablelm2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" , },
81
- {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
82
- {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
83
- {"name" : "qwen2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" , },
84
- {"name" : "olmo" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" , },
85
- {"name" : "dbrx" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/databricks/dbrx-base" , },
86
- {"name" : "jina-v1-en" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en" , },
87
- {"name" : "jina-v2-en" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" , }, # WPM!
88
- {"name" : "jina-v2-es" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" , },
89
- {"name" : "jina-v2-de" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" , },
90
- {"name" : "smaug-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" , },
91
- {"name" : "poro-chat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" , },
92
- {"name" : "jina-v2-code" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" , },
93
- {"name" : "viking" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Viking-7B" , }, # Also used for Viking 13B and 33B
94
- {"name" : "gemma" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2b" , },
95
- {"name" : "gemma-2" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2-9b" , },
96
- {"name" : "jais" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/core42/jais-13b" , },
97
- {"name" : "t5" , "tokt" : TOKENIZER_TYPE .UGM , "repo" : "https://huggingface.co/google-t5/t5-small" , },
98
- {"name" : "codeshell" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/WisdomShell/CodeShell-7B" , },
99
- {"name" : "tekken" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407" , },
100
- {"name" : "smollm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/HuggingFaceTB/SmolLM-135M" , },
101
- {'name' : "bloom" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigscience/bloom" , },
102
- {'name' : "gpt3-finnish" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/TurkuNLP/gpt3-finnish-small" , },
103
- {"name" : "exaone" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" , },
104
- {"name" : "phi-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/microsoft/phi-2" , },
105
- {"name" : "chameleon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/facebook/chameleon-7b" , },
106
- {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , },
107
- {"name" : "roberta-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sentence-transformers/stsb-roberta-base" },
108
- {"name" : "gigachat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct" },
109
- {"name" : "megrez" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Infinigence/Megrez-3B-Instruct" },
110
- {"name" : "deepseek-v3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/DeepSeek-V3" },
68
+ {"name" : "llama-spm" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/meta-llama/Llama-2-7b-hf" , },
69
+ {"name" : "llama-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/meta-llama/Meta-Llama-3-8B" , },
70
+ {"name" : "phi-3" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" , },
71
+ {"name" : "deepseek-llm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base" , },
72
+ {"name" : "deepseek-coder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base" , },
73
+ {"name" : "falcon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/falcon-7b" , },
74
+ {"name" : "bert-bge" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/BAAI/bge-small-en-v1.5" , },
75
+ {"name" : "falcon3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/tiiuae/Falcon3-7B-Base" , },
76
+ {"name" : "bert-bge-large" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/BAAI/bge-large-zh-v1.5" , },
77
+ {"name" : "mpt" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mosaicml/mpt-7b" , },
78
+ {"name" : "starcoder" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigcode/starcoder2-3b" , },
79
+ {"name" : "gpt-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/openai-community/gpt2" , },
80
+ {"name" : "stablelm2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b" , },
81
+ {"name" : "refact" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/smallcloudai/Refact-1_6-base" , },
82
+ {"name" : "command-r" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/CohereForAI/c4ai-command-r-v01" , },
83
+ {"name" : "qwen2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Qwen/Qwen1.5-7B" , },
84
+ {"name" : "olmo" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/allenai/OLMo-1.7-7B-hf" , },
85
+ {"name" : "dbrx" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/databricks/dbrx-base" , },
86
+ {"name" : "jina-v1-en" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en" , },
87
+ {"name" : "jina-v2-en" , "tokt" : TOKENIZER_TYPE .WPM , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-en" , }, # WPM!
88
+ {"name" : "jina-v2-es" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-es" , },
89
+ {"name" : "jina-v2-de" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-de" , },
90
+ {"name" : "smaug-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct" , },
91
+ {"name" : "poro-chat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Poro-34B-chat" , },
92
+ {"name" : "jina-v2-code" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/jinaai/jina-embeddings-v2-base-code" , },
93
+ {"name" : "viking" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LumiOpen/Viking-7B" , }, # Also used for Viking 13B and 33B
94
+ {"name" : "gemma" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2b" , },
95
+ {"name" : "gemma-2" , "tokt" : TOKENIZER_TYPE .SPM , "repo" : "https://huggingface.co/google/gemma-2-9b" , },
96
+ {"name" : "jais" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/core42/jais-13b" , },
97
+ {"name" : "t5" , "tokt" : TOKENIZER_TYPE .UGM , "repo" : "https://huggingface.co/google-t5/t5-small" , },
98
+ {"name" : "codeshell" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/WisdomShell/CodeShell-7B" , },
99
+ {"name" : "tekken" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407" , },
100
+ {"name" : "smollm" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/HuggingFaceTB/SmolLM-135M" , },
101
+ {'name' : "bloom" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/bigscience/bloom" , },
102
+ {'name' : "gpt3-finnish" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/TurkuNLP/gpt3-finnish-small" , },
103
+ {"name" : "exaone" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" , },
104
+ {"name" : "phi-2" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/microsoft/phi-2" , },
105
+ {"name" : "chameleon" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/facebook/chameleon-7b" , },
106
+ {"name" : "minerva-7b" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0" , },
107
+ {"name" : "roberta-bpe" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/sentence-transformers/stsb-roberta-base" },
108
+ {"name" : "gigachat" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct" },
109
+ {"name" : "megrez" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/Infinigence/Megrez-3B-Instruct" },
110
+ {"name" : "deepseek-v3" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/DeepSeek-V3" },
111
+ {"name" : "deepseek-r1-qwen" , "tokt" : TOKENIZER_TYPE .BPE , "repo" : "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" },
111
112
]
112
113
113
114
0 commit comments