Comments (6)
Yes, you can refer to #4715 for a conversion script. However, Phi-3 128k also contains a gate_up_proj layer, so you can modify the script to decompose the gate_up_proj layer as well, like the following:
import torch
import os
import json
import fire
import shutil
from safetensors.torch import load_file, save_file


def replicate_lora_a_qkv(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # lora_A is shared by the fused qkv_proj, so each split projection gets a full copy
    prefix, suffix = name.split('qkv_proj')
    res = {}
    for t in ['q_proj', 'k_proj', 'v_proj']:
        new_name = f"{prefix}{t}{suffix}"
        res[new_name] = weight.clone()
    return res


def replicate_lora_a_gate_up(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # same idea for the fused gate_up_proj: replicate lora_A for gate_proj and up_proj
    prefix, suffix = name.split('gate_up_proj')
    res = {}
    for t in ['gate_proj', 'up_proj']:
        new_name = f"{prefix}{t}{suffix}"
        res[new_name] = weight.clone()
    return res


def split_lora_b_qkv(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # lora_B stacks the q/k/v output rows, so split it into three equal row blocks
    size = weight.shape[0] // 3
    prefix, suffix = name.split('qkv_proj')
    res = {
        f"{prefix}{t}{suffix}": w
        for t, w in zip(['q_proj', 'k_proj', 'v_proj'], weight.split(size))
    }
    return res


def split_lora_b_gate_up(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # lora_B stacks the gate/up output rows, so split it into two equal row blocks
    size = weight.shape[0] // 2
    prefix, suffix = name.split('gate_up_proj')
    res = {
        f"{prefix}{t}{suffix}": w
        for t, w in zip(['gate_proj', 'up_proj'], weight.split(size))
    }
    return res


def convert_qkv_gate_up_lora_to_splits_vllm(adapter_folder_path: str,
                                            output_folder_path: str) -> None:
    """Convert a fused qkv_proj/gate_up_proj LoRA adapter into split projections."""
    adapter_bin_name = 'adapter_model.safetensors'
    adapter_config_name = 'adapter_config.json'
    lora = load_file(f"{adapter_folder_path}/{adapter_bin_name}")
    with open(f"{adapter_folder_path}/{adapter_config_name}", 'r') as f:
        lora_config = json.load(f)
    assert 'qkv_proj' in lora_config['target_modules']
    assert 'gate_up_proj' in lora_config['target_modules']

    # converting weights: replicate lora_A, split lora_B, pass everything else through
    res = {}
    for k, v in lora.items():
        if 'qkv_proj' in k and 'lora_A' in k:
            res.update(replicate_lora_a_qkv(k, v))
        elif 'qkv_proj' in k and 'lora_B' in k:
            res.update(split_lora_b_qkv(k, v))
        elif 'gate_up_proj' in k and 'lora_A' in k:
            res.update(replicate_lora_a_gate_up(k, v))
        elif 'gate_up_proj' in k and 'lora_B' in k:
            res.update(split_lora_b_gate_up(k, v))
        else:
            res[k] = v

    # converting config: swap the fused module names for the split ones
    lora_config['target_modules'] = (
        ['q_proj', 'k_proj', 'v_proj', 'gate_proj', 'up_proj']
        + [t for t in lora_config['target_modules']
           if t != 'qkv_proj' and t != 'gate_up_proj']
    )

    # saving: write the new weights and config, copy everything else verbatim
    os.makedirs(output_folder_path, exist_ok=True)
    save_file(res, f"{output_folder_path}/{adapter_bin_name}", metadata={"format": "pt"})
    with open(f"{output_folder_path}/{adapter_config_name}", 'w') as f:
        json.dump(lora_config, f, indent=4)
    for file in os.listdir(adapter_folder_path):
        if file != adapter_bin_name and file != adapter_config_name:
            shutil.copy(f"{adapter_folder_path}/{file}", f"{output_folder_path}/{file}")


if __name__ == "__main__":
    fire.Fire(convert_qkv_gate_up_lora_to_splits_vllm)
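Since the script exposes the converter through fire, you can run it from the command line or call it directly; a minimal usage sketch (the filename and paths here are placeholders):

# from the shell (assuming the script is saved as convert_lora.py):
#   python convert_lora.py /path/to/phi3_lora_adapter /path/to/converted_adapter
# or programmatically:
convert_qkv_gate_up_lora_to_splits_vllm(
    "/path/to/phi3_lora_adapter",   # folder with adapter_model.safetensors + adapter_config.json
    "/path/to/converted_adapter",   # created if it does not exist
)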
vLLM can load it without error, but I am not sure whether there will be any performance issue.
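The conversion itself is lossless: because lora_A is replicated and lora_B is split along its rows, the concatenated per-projection updates equal the fused update, i.e. B @ A = concat(B_q @ A, B_k @ A, B_v @ A). A minimal numerical check of that identity (dimensions made up):

import torch

r, d_in, d_head = 8, 64, 32           # hypothetical small dimensions
A = torch.randn(r, d_in)              # fused lora_A, shared by q/k/v
B = torch.randn(3 * d_head, r)        # fused lora_B for qkv_proj

# split B into per-projection row blocks, exactly as split_lora_b_qkv does
Bq, Bk, Bv = B.split(d_head)

# the concatenated per-projection updates equal the fused update
assert torch.allclose(B @ A, torch.cat([Bq @ A, Bk @ A, Bv @ A]))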
I converted the adapter to the new projection layers using the conversion code above and loaded it with the standard vLLM LoRA workflow (downloading via snapshot_download), following their documentation (https://docs.vllm.ai/en/latest/models/lora.html#using-lora-adapters).
It did work without errors, though performance dropped massively compared to the same model (with merged LoRA) running without vLLM. I used the same tokenizer for both models.
I have not found out yet why this is the case; it might just be a bug in my own code somewhere.
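For context, a minimal sketch of that loading path from the linked docs (the model name and adapter path are placeholders, not my exact setup):

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# enable_lora must be set when constructing the engine
llm = LLM(model="microsoft/Phi-3-mini-128k-instruct", enable_lora=True)

# point the LoRARequest at the *converted* adapter folder
outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.0, max_tokens=32),
    lora_request=LoRARequest("phi3_adapter", 1, "/path/to/converted_adapter"),
)
print(outputs[0].outputs[0].text)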
Update: there was a bug in my code; the script above works just fine and the LoRA weights are merged correctly.
Thanks a lot. I will use this, but will vLLM be adding support for this in the future?
After decomposing the layers of Phi-3, will there be any problem merging the LoRA layers back into Phi-3? Do we have to merge the decomposed layers back before applying the LoRA to the original Phi-3?
> Thanks a lot. I will use this, but will vLLM be adding support for this in the future?
I guess they will; otherwise we will always need to convert the LoRA first, which is not very convenient.
> After decomposing the layers of Phi-3, will there be any problem merging the LoRA layers back into Phi-3?
I think you can directly merge the original (not decomposed) LoRA adapter into Phi-3 if you are not trying to load the LoRA via vLLM.
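For example, a minimal sketch of that direct merge with PEFT (the model name and adapter path are placeholders):

from transformers import AutoModelForCausalLM
from peft import PeftModel

# load the base model, then attach the ORIGINAL (fused) adapter
base = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
model = PeftModel.from_pretrained(base, "/path/to/original_adapter")

# fold the LoRA deltas into the base weights and drop the adapter wrappers
merged = model.merge_and_unload()
merged.save_pretrained("/path/to/merged_model")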