
Comments (12)

AlpinDale commented on June 11, 2024

Ah I see what the issue is.

We're using a custom GGUF model parser in aphrodite, which means everything needs to be hand-written and implemented for every model architecture. Llama, Mistral, et al. fall under the llama category of models in llama.cpp, so their tensors and configs match those of every other llama-family model. Models like qwen2, command-r, etc. are supported by llama.cpp but use different names for their tensors. To add support for these, we'd have to handle every model individually. I haven't gotten around to doing it yet; it'd need a fair bit of work. If you (or anyone else) would like to contribute, I'd start by looking at these two places:

def convert_gguf_to_tokenizer(checkpoint):
    result = GGUFReader(checkpoint)
    # write vocab
    sentencepiece_model_pb2 = import_protobuf()
    vocab = sentencepiece_model_pb2.ModelProto()
    vocab_size = len(result.fields['tokenizer.ggml.token_type'].data)
    vocab.trainer_spec.model_type = 2  # BPE
    vocab.trainer_spec.vocab_size = vocab_size
    vocab.trainer_spec.byte_fallback = True
    vocab.normalizer_spec.remove_extra_whitespaces = False
    tokens = result.fields['tokenizer.ggml.tokens']
    scores = result.fields['tokenizer.ggml.scores']
    types = result.fields['tokenizer.ggml.token_type']
    for i in range(vocab_size):
        new_token = vocab.SentencePiece()
        new_token.piece = str(bytes(tokens.parts[tokens.data[i]]),
                              encoding='utf-8')
        new_token.score = scores.parts[scores.data[i]]
        # llama.cpp tokentype is the same with sentencepiece token type
        new_token.type = int(types.parts[types.data[i]])
        vocab.pieces.append(new_token)
    with tempfile.NamedTemporaryFile(mode='wb', delete=False) as temp_file:
        temp_file.write(vocab.SerializeToString())
        temp_file_filename = temp_file.name
    tokenizer_args = {"vocab_file": temp_file_filename}
    if 'tokenizer.ggml.bos_token_id' in result.fields:
        tokenizer_args["bos_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.bos_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.eos_token_id' in result.fields:
        tokenizer_args["eos_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.eos_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.padding_token_id' in result.fields:
        tokenizer_args["pad_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.padding_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.unknown_token_id' in result.fields:
        tokenizer_args["unk_token"] = vocab.pieces[int(
            result.fields['tokenizer.ggml.unknown_token_id'].parts[-1])].piece
    if 'tokenizer.ggml.add_bos_token' in result.fields:
        tokenizer_args["add_bos_token"] = bool(
            result.fields['tokenizer.ggml.add_bos_token'].parts[-1])
    if 'tokenizer.ggml.add_eos_token' in result.fields:
        tokenizer_args["add_eos_token"] = bool(
            result.fields['tokenizer.ggml.add_eos_token'].parts[-1])
    if 'tokenizer.chat_template' in result.fields:
        tokenizer_args["chat_template"] = str(
            bytes(result.fields['tokenizer.chat_template'].parts[-1]))
    tokenizer = LlamaTokenizer(**tokenizer_args)
    os.unlink(temp_file_filename)
    return tokenizer

def convert_gguf_to_state_dict(checkpoint, config):
    if not os.path.isfile(checkpoint):
        raise RuntimeError(
            f"Cannot find any model weights with `{checkpoint}`")
    result = GGUFReader(checkpoint)
    # write tensor
    kv_dim = (config.hidden_size // config.num_attention_heads *
              config.num_key_value_heads)
    tensor_mapping = {
        "token_embd": ("model.embed_tokens", config.vocab_size),
        "output": ("lm_head", config.vocab_size),
        "output_norm": ("model.norm", -1),
        "blk.{bid}.attn_norm": ("model.layers.{bid}.input_layernorm", -1),
        "blk.{bid}.attn_q": ("model.layers.{bid}.self_attn.q_proj",
                             config.hidden_size),
        "blk.{bid}.attn_k": ("model.layers.{bid}.self_attn.k_proj", kv_dim),
        "blk.{bid}.attn_v": ("model.layers.{bid}.self_attn.v_proj", kv_dim),
        "blk.{bid}.attn_output": ("model.layers.{bid}.self_attn.o_proj",
                                  config.hidden_size),
        "blk.{bid}.attn_rot_embd":
        ("model.layers.{bid}.self_attn.rotary_emb.inv_freq", -1),
        "blk.{bid}.ffn_norm": ("model.layers.{bid}.post_attention_layernorm",
                               -1),
        "blk.{bid}.ffn_up": ("model.layers.{bid}.mlp.up_proj",
                             config.intermediate_size),
        "blk.{bid}.ffn_down": ("model.layers.{bid}.mlp.down_proj",
                               config.hidden_size),
        "blk.{bid}.ffn_gate": ("model.layers.{bid}.mlp.gate_proj",
                               config.intermediate_size),
        "blk.{bid}.ffn_up.{xid}":
        ("model.layers.{bid}.block_sparse_moe.experts.{xid}.w3",
         config.intermediate_size),
        "blk.{bid}.ffn_down.{xid}":
        ("model.layers.{bid}.block_sparse_moe.experts.{xid}.w2",
         config.hidden_size),
        "blk.{bid}.ffn_gate.{xid}":
        ("model.layers.{bid}.block_sparse_moe.experts.{xid}.w1",
         config.intermediate_size),
        "blk.{bid}.ffn_gate_inp": ("model.layers.{bid}.block_sparse_moe.gate",
                                   config.num_local_experts if hasattr(
                                       config, "num_local_experts") else -1),
    }
    mapping = {}
    # This is how llama.cpp handles name mapping,
    # it's better to use regex match instead doe
    max_block_num = 200
    max_expert_num = 8
    for k, v in tensor_mapping.items():
        for i in range(max_block_num):
            for j in range(max_expert_num):
                fk = k.format(bid=i, xid=j)
                fv = v[0].format(bid=i, xid=j)
                if k not in mapping:
                    mapping[fk] = (fv, v[1])
    state_dict = {}
    with get_loading_progress_bar() as progress:
        task = progress.add_task(
            "[cyan]Converting GGUF tensors to PyTorch...",
            total=len(result.tensors))
        for ts in result.tensors:
            weight_type = torch.tensor(int(ts.tensor_type), dtype=torch.int)
            layer, suffix = ts.name.rsplit(".", 1)
            new_key, output_dim = mapping[layer]
            new_key += f".{suffix}"
            data = torch.tensor(ts.data)
            if output_dim != -1:
                data = data.view(output_dim, -1)
            if weight_type > 1:
                state_dict[new_key.replace("weight",
                                           "weight_type")] = weight_type
            state_dict[new_key] = data
            progress.update(task, advance=1)
    return state_dict
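
For anyone who wants to pick this up, one detail worth knowing: GGUF metadata keys are prefixed with the architecture name recorded in general.architecture, so a qwen2 file stores qwen2.block_count where a llama file stores llama.block_count. Below is a minimal sketch of reading that prefix instead of hard-coding llama, reusing the GGUFReader field-access pattern from the snippets above; the helper name and the returned dict are illustrative, not aphrodite's actual API.

from gguf import GGUFReader


def read_gguf_arch_params(checkpoint):
    # Illustrative helper (not aphrodite code): read the architecture name
    # and the common per-architecture hyperparameters from a GGUF file.
    fields = GGUFReader(checkpoint).fields
    arch = str(bytes(fields['general.architecture'].parts[-1]),
               encoding='utf-8')

    def read_int(name):
        # Per-architecture keys look like '<arch>.<name>',
        # e.g. 'qwen2.block_count' or 'llama.block_count'.
        field = fields.get(f'{arch}.{name}')
        return int(field.parts[-1]) if field is not None else None

    return {
        'architecture': arch,  # 'llama', 'qwen2', ...
        'num_hidden_layers': read_int('block_count'),
        'hidden_size': read_int('embedding_length'),
        'intermediate_size': read_int('feed_forward_length'),
        'num_attention_heads': read_int('attention.head_count'),
        'num_key_value_heads': read_int('attention.head_count_kv'),
        'max_position_embeddings': read_int('context_length'),
    }

The tensor_mapping above would need the same treatment: either per-architecture entries or a lookup keyed on that prefix.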


AlpinDale commented on June 11, 2024

I will take a closer look, but FYI, exl2 quants do not work with multi-gpu setups. It's the only quant with that limitation.


sorasoras commented on June 11, 2024

I will take a closer look, but FYI, exl2 quants do not work with multi-gpu setups. It's the only quant with that limitation.

It's a single-P40 setup inside WSL2, so I don't know why it raises a ValueError like that.


AlpinDale commented on June 11, 2024

That would be the -tp 2 in your command. Please see here for a full list of the commands and what they do.


sorasoras commented on June 11, 2024

That would be the -tp 2 in your command. Please see here for a full list of the commands and what they do.

python -m aphrodite.endpoints.openai.api_server --model /mnt/c/model/sparsetral-16x7B-v2-SPIN_iter1-exl2-6.5/ -tp 1 --api-keys sk-example --trust-remote-code --dtype float32 --kv-cache-dtype fp8_e5m2
You are using a model of type sparsetral to instantiate a model of type mistral. This is not supported for all configurations of models and can yield errors.
INFO:     CUDA_HOME is not found in the environment. Using /usr/local/cuda as CUDA_HOME.
INFO:     Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO:     Initializing the Aphrodite Engine (v0.5.1) with the following config:
INFO:     Model = '/mnt/c/model/sparsetral-16x7B-v2-SPIN_iter1-exl2-6.5/'
INFO:     DataType = torch.float32
INFO:     Model Load Format = auto
INFO:     Number of GPUs = 1
INFO:     Disable Custom All-Reduce = False
INFO:     Quantization Format = None
INFO:     Context Length = 32768
INFO:     Enforce Eager Mode = False
INFO:     KV Cache Data Type = fp8_e5m2
INFO:     KV Cache Params Path = None
INFO:     Device = cuda
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/endpoints/openai/api_server.py", line 563, in <module>
    engine = AsyncAphrodite.from_engine_args(engine_args)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/async_aphrodite.py", line 676, in from_engine_args
    engine = cls(parallel_config.worker_use_ray,
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/async_aphrodite.py", line 341, in __init__
    self.engine = self._init_engine(*args, **kwargs)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/async_aphrodite.py", line 410, in _init_engine
    return engine_class(*args, **kwargs)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/aphrodite_engine.py", line 115, in __init__
    self._init_workers()
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/aphrodite_engine.py", line 157, in _init_workers
    self._run_workers("load_model")
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/aphrodite_engine.py", line 1028, in _run_workers
    driver_worker_output = getattr(self.driver_worker,
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/task_handler/worker.py", line 112, in load_model
    self.model_runner.load_model()
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/task_handler/model_runner.py", line 121, in load_model
    self.model = get_model(self.model_config, self.device_config,
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/modeling/loader.py", line 47, in get_model
    model_class = _get_model_architecture(model_config)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/modeling/loader.py", line 39, in _get_model_architecture
    raise ValueError(
ValueError: Model architectures ['modeling_sparsetral.MistralForCausalLM'] are not supported for now. Supported architectures: ['AquilaModel', 'AquilaForCausalLM', 'BaiChuanForCausalLM', 'BaichuanForCausalLM', 'BloomForCausalLM', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 'DeciLMForCausalLM', 'DeepseekForCausalLM', 'FalconForCausalLM', 'GemmaForCausalLM', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTJForCausalLM', 'GPTNeoXForCausalLM', 'InternLMForCausalLM', 'InternLM2ForCausalLM', 'LlamaForCausalLM', 'LLaMAForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'QuantMixtralForCausalLM', 'MptForCausalLM', 'MPTForCausalLM', 'OLMoForCausalLM', 'OPTForCausalLM', 'PhiForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'RWForCausalLM', 'StableLMEpochForCausalLM', 'StableLmForCausalLM']


I think I got it working, but 'modeling_sparsetral.MistralForCausalLM' is not supported for now.


AlpinDale commented on June 11, 2024

You can probably remove the modeling_sparsetral part from the model's config.json; it may work, but it'll skip all the MoE stuff. The same thing is happening with that exl2 quant, I imagine, because exl2 doesn't support this arch.
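
For reference, here is a rough sketch of that config.json edit. It is illustrative only: the current architectures value is taken from the traceback above, the auto_map cleanup is an assumption, and as noted this drops the MoE-specific code, so edit a copy if in doubt.

import json

# Model directory from the command above.
config_path = ("/mnt/c/model/sparsetral-16x7B-v2-SPIN_iter1-exl2-6.5/"
               "config.json")

with open(config_path) as f:
    cfg = json.load(f)

# Before (per the error above): ["modeling_sparsetral.MistralForCausalLM"]
cfg["architectures"] = ["MistralForCausalLM"]
# If the config also carries an auto_map pointing at the custom sparsetral
# module, drop it so the stock MistralForCausalLM loader is used instead.
cfg.pop("auto_map", None)

with open(config_path, "w") as f:
    json.dump(cfg, f, indent=2)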


sorasoras commented on June 11, 2024

You can probably remove the modeling_sparsetral part from the model's config.json; it may work, but it'll skip all the MoE stuff. The same thing is happening with that exl2 quant, I imagine, because exl2 doesn't support this arch.

I have another question regarding qwen1.5/qwen1 in general.
I tried to load a qwen1.5 (or qwen1) GGUF directly with
python -m aphrodite.endpoints.openai.api_server --model sakura0.9_13B_Qwen1.5_Q5KS_1.2.gguf -tp 1 --api-keys sk-example
and got

Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/endpoints/openai/api_server.py", line 563, in <module>
    engine = AsyncAphrodite.from_engine_args(engine_args)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/async_aphrodite.py", line 670, in from_engine_args
    engine_configs = engine_args.create_engine_configs()
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/engine/args_tools.py", line 318, in create_engine_configs
    model_config = ModelConfig(
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/common/config.py", line 116, in __init__
    self.hf_config = get_config(self.model, trust_remote_code, revision)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/transformers_utils/config.py", line 86, in get_config
    return extract_gguf_config(model)
  File "/home/sora/.local/lib/python3.10/site-packages/aphrodite/transformers_utils/config.py", line 28, in extract_gguf_config
    raise RuntimeError(f"Unsupported architecture {architecture}")
RuntimeError: Unsupported architecture qwen2

I guess I need to convert it to PTH before using it?
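
For context, that RuntimeError is raised from the GGUF metadata itself, before any weights are read; a quick illustrative check with the same GGUFReader used in the snippets above shows what the file reports:

from gguf import GGUFReader

fields = GGUFReader("sakura0.9_13B_Qwen1.5_Q5KS_1.2.gguf").fields
arch = str(bytes(fields['general.architecture'].parts[-1]), encoding='utf-8')
print(arch)  # 'qwen2', which extract_gguf_config does not handle yet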


AlpinDale commented on June 11, 2024

Works fine with the FP16 model. Can you link me to the gguf if it's public?


sorasoras commented on June 11, 2024

Works fine with the FP16 model. Can you link me to the gguf if it's public?

https://huggingface.co/shing3232/Sakura13B-LNovel-v0.9-qwen1.5-GGUF-IMX/blob/main/sakura0.9_13B_Qwen1.5_Q5KS_1.2.gguf


sorasoras commented on June 11, 2024

Ah I see what the issue is.

We're using a custom GGUF model parser in aphrodite, which means everything needs to be hand-written and implemented for every model architecture. Llama, Mistral, et al. fall under the llama category of models in llama.cpp, so their tensors and configs match those of every other llama-family model. Models like qwen2, command-r, etc. are supported by llama.cpp but use different names for their tensors. To add support for these, we'd have to handle every model individually. I haven't gotten around to doing it yet; it'd need a fair bit of work. If you (or anyone else) would like to contribute, I'd start by looking at these two places: […]


I can't offer much help with the coding, but I thought this could be done in reverse:
https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py
That script converts HF models to GGUF; perhaps the same mapping could be applied in the other direction (see the sketch below).
Anyway, thanks for the hard work.
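
For what it's worth, the convert_gguf_to_state_dict snippet quoted earlier already goes in that reverse direction (GGUF tensors to HF-style names), just in memory. Here is a hedged sketch of dumping its output to a .pth file, assuming that function is importable and that a matching HF config is available; the paths and config id below are placeholders.

import torch
from transformers import AutoConfig

# Placeholder id: a config whose hidden_size/head counts match the GGUF model.
config = AutoConfig.from_pretrained("path/to/matching-hf-config")

state_dict = convert_gguf_to_state_dict(
    "sakura0.9_13B_Qwen1.5_Q5KS_1.2.gguf", config)

# Note: the tensors stay in their GGUF-quantized layout (with the extra
# *.weight_type entries), so the resulting .pth is only usable by a loader
# that understands that format; it is not a dequantized HF checkpoint.
torch.save(state_dict, "sakura0.9_13B_Qwen1.5_Q5KS_1.2.pth")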

