diff --git a/.gitignore b/.gitignore
index 60afa89c27..ccfad036dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,6 +44,7 @@ htmlcov/
 *build*/
 !builder/
 lmdeploy/lib/
+lmdeploy/bin/
 dist/
 examples/cpp/llama/*.csv
 *.npy
diff --git a/MANIFEST.in b/MANIFEST.in
index 6f61160e4f..ab9b0b57bf 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,5 +3,6 @@ include lmdeploy/lib/*.so
 include lmdeploy/lib/*.so*
 include lmdeploy/lib/*.dll
 include lmdeploy/lib/*.pyd
+include lmdeploy/bin/*
 include lmdeploy/serve/turbomind/service_docker_up.sh
 recursive-include lmdeploy/serve/turbomind/triton_models *
diff --git a/examples/cpp/llama/generate_gemm_config.py b/examples/cpp/llama/generate_gemm_config.py
deleted file mode 100644
index ad53e312fd..0000000000
--- a/examples/cpp/llama/generate_gemm_config.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import subprocess
-
-import fire
-
-
-def main(head_num: int = 32,
-         size_per_head: int = 128,
-         vocab_size: int = 32000,
-         inter_size: int = 11008,
-         tensor_para_size: int = 1,
-         max_batch_size: int = 64):
-    for bsz in range(1, max_batch_size + 1):
-        subprocess.call(
-            f'bin/llama_gemm {bsz} 1 1 {head_num} {size_per_head} {inter_size}'
-            f' {vocab_size} 1 {tensor_para_size} {0 if bsz == 1 else 1}',
-            shell=True)
-
-
-if __name__ == '__main__':
-    fire.Fire(main)
diff --git a/lmdeploy/turbomind/generate_gemm_config.py b/lmdeploy/turbomind/generate_gemm_config.py
new file mode 100644
index 0000000000..328f182158
--- /dev/null
+++ b/lmdeploy/turbomind/generate_gemm_config.py
@@ -0,0 +1,49 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import subprocess
+
+import fire
+
+
+def get_llama_gemm():
+    """Return the path of the ``llama_gemm`` binary bundled with lmdeploy.
+
+    Raises:
+        FileNotFoundError: If ``bin/llama_gemm`` is not in the package dir.
+    """
+    import os.path as osp
+
+    import lmdeploy
+    lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
+    bin_path = osp.join(lmdeploy_dir, 'bin', 'llama_gemm')
+    # Raise explicitly; an ``assert`` is stripped when Python runs with -O.
+    if not osp.exists(bin_path):
+        raise FileNotFoundError(f'{bin_path} does not exist')
+    return bin_path
+
+
+def main(head_num: int = 32,
+         size_per_head: int = 128,
+         vocab_size: int = 32000,
+         inter_size: int = 11008,
+         tensor_para_size: int = 1,
+         max_batch_size: int = 64):
+    """Invoke ``llama_gemm`` once per batch size in ``1..max_batch_size``.
+
+    The keyword arguments are forwarded to the binary as positional
+    command-line arguments; the last argument is ``0`` for batch size 1 and
+    ``1`` for every other batch size.
+    """
+    # The binary location is loop-invariant, so resolve it only once.
+    llama_gemm = get_llama_gemm()
+    for bsz in range(1, max_batch_size + 1):
+        # An argument list (no shell) keeps install paths with spaces intact.
+        subprocess.call([
+            llama_gemm, str(bsz), '1', '1', str(head_num), str(size_per_head),
+            str(inter_size), str(vocab_size), '1', str(tensor_para_size),
+            '0' if bsz == 1 else '1'
+        ])
+
+
+if __name__ == '__main__':
+    fire.Fire(main)
diff --git a/setup.py b/setup.py
index fab3002655..09ae1e31c2 100644
--- a/setup.py
+++ b/setup.py
@@ -120,6 +120,7 @@ def gen_packages_items():
 
 
 if __name__ == '__main__':
+    lmdeploy_package_data = ['bin/llama_gemm']
     setup(name='lmdeploy',
           version=get_version(),
           description='A toolset for compressing, deploying and serving LLM',
@@ -128,6 +129,9 @@ def gen_packages_items():
           author='OpenMMLab',
           author_email='openmmlab@gmail.com',
           packages=find_packages(exclude=()),
+          package_data={
+              'lmdeploy': lmdeploy_package_data,
+          },
           include_package_data=True,
           install_requires=parse_requirements('requirements.txt'),
           has_ext_modules=check_ext_modules,
diff --git a/src/turbomind/models/llama/CMakeLists.txt b/src/turbomind/models/llama/CMakeLists.txt
index d7751e7d63..10b93fb9ec 100644
--- a/src/turbomind/models/llama/CMakeLists.txt
+++ b/src/turbomind/models/llama/CMakeLists.txt
@@ -48,3 +48,4 @@ endif()
 
 add_executable(llama_gemm llama_gemm.cc)
 target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
+install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin)