包含以下内容:
- embedding_f32_kernel
- embedding_f32x4_kernel(float4向量化版本)
- embedding_f32x4_pack_kernel(float4向量化,pack版本)
- embedding_f16_kernel(fp16版本)
- embedding_f16x8_kernel(fp16向量化版本)
- embedding_f16x8_pack_kernel(fp16向量化,pack版本)
- PyTorch bindings
# 只测试Ada架构;若不指定,则默认编译所有架构(Volta, Ampere, Ada, Hopper, ...),耗时较长
export TORCH_CUDA_ARCH_LIST=Ada
python3 embedding.py
这是一个elementwise的操作,但是有个值得探究的问题:在f16下,pack版本的性能优于没有pack的版本;而在f32下则大多相反(pack版本略慢于没有pack的版本)。输出:
--------------------------------------------------------------------------------------------------------------
MaxV=1024, SeqLen=2048, EmbSize=512
out_f32: ['0.69075936 ', '0.56517494 ', '-0.12546943 '], time:0.005317ms
out_f32x4: ['0.69075936 ', '0.56517494 ', '-0.12546943 '], time:0.004125ms
out_f32x4_pack: ['0.69075936 ', '0.56517494 ', '-0.12546943 '], time:0.004017ms
out_f32_th: ['0.69075936 ', '0.56517494 ', '-0.12546943 '], time:0.012147ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['-1.27734375 ', '-0.92822266 ', '-1.4453125 '], time:0.005090ms
out_f16x8: ['-1.27734375 ', '-0.92822266 ', '-1.4453125 '], time:0.004089ms
out_f16x8_pack: ['-1.27734375 ', '-0.92822266 ', '-1.4453125 '], time:0.004041ms
out_f16_th: ['-1.27734375 ', '-0.92822266 ', '-1.4453125 '], time:0.011230ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=1024, SeqLen=2048, EmbSize=1024
out_f32: ['-1.34922504 ', '-0.04674992 ', '1.24448562 '], time:0.011468ms
out_f32x4: ['-1.34922504 ', '-0.04674992 ', '1.24448562 '], time:0.005364ms
out_f32x4_pack: ['-1.34922504 ', '-0.04674992 ', '1.24448562 '], time:0.005448ms
out_f32_th: ['-1.34922504 ', '-0.04674992 ', '1.24448562 '], time:0.037062ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['-0.47412109 ', '-0.47070312 ', '1.41894531 '], time:0.011039ms
out_f16x8: ['-0.47412109 ', '-0.47070312 ', '1.41894531 '], time:0.004971ms
out_f16x8_pack: ['-0.47412109 ', '-0.47070312 ', '1.41894531 '], time:0.004065ms
out_f16_th: ['-0.47412109 ', '-0.47070312 ', '1.41894531 '], time:0.016463ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=1024, SeqLen=4096, EmbSize=512
out_f32: ['0.30875409 ', '0.98055625 ', '-0.86661887 '], time:0.008464ms
out_f32x4: ['0.30875409 ', '0.98055625 ', '-0.86661887 '], time:0.005400ms
out_f32x4_pack: ['0.30875409 ', '0.98055625 ', '-0.86661887 '], time:0.005484ms
out_f32_th: ['0.30875409 ', '0.98055625 ', '-0.86661887 '], time:0.016463ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['-1.04492188 ', '0.44750977 ', '-1.25878906 '], time:0.008070ms
out_f16x8: ['-1.04492188 ', '0.44750977 ', '-1.25878906 '], time:0.005186ms
out_f16x8_pack: ['-1.04492188 ', '0.44750977 ', '-1.25878906 '], time:0.004339ms
out_f16_th: ['-1.04492188 ', '0.44750977 ', '-1.25878906 '], time:0.016415ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=1024, SeqLen=4096, EmbSize=1024
out_f32: ['-0.07979927 ', '-0.20634571 ', '0.9166832 '], time:0.021017ms
out_f32x4: ['-0.07979927 ', '-0.20634571 ', '0.9166832 '], time:0.008297ms
out_f32x4_pack: ['-0.07979927 ', '-0.20634571 ', '0.9166832 '], time:0.008714ms
out_f32_th: ['-0.07979927 ', '-0.20634571 ', '0.9166832 '], time:0.037730ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['-2.19726562 ', '0.50439453 ', '1.40917969 '], time:0.020158ms
out_f16x8: ['-2.19726562 ', '0.50439453 ', '1.40917969 '], time:0.007451ms
out_f16x8_pack: ['-2.19726562 ', '0.50439453 ', '1.40917969 '], time:0.005496ms
out_f16_th: ['-2.19726562 ', '0.50439453 ', '1.40917969 '], time:0.030172ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=4096, SeqLen=2048, EmbSize=512
out_f32: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.005329ms
out_f32x4: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.004041ms
out_f32x4_pack: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.004005ms
out_f32_th: ['-0.13596091 ', '-0.07996719 ', '-2.77454519 '], time:0.011110ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['0.14440918 ', '0.26367188 ', '0.77539062 '], time:0.005066ms
out_f16x8: ['0.14440918 ', '0.26367188 ', '0.77539062 '], time:0.004041ms
out_f16x8_pack: ['0.14440918 ', '0.26367188 ', '0.77539062 '], time:0.003982ms
out_f16_th: ['0.14440918 ', '0.26367188 ', '0.77539062 '], time:0.011170ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=4096, SeqLen=2048, EmbSize=1024
out_f32: ['1.26920044 ', '0.12124556 ', '0.38764721 '], time:0.011575ms
out_f32x4: ['1.26920044 ', '0.12124556 ', '0.38764721 '], time:0.005329ms
out_f32x4_pack: ['1.26920044 ', '0.12124556 ', '0.38764721 '], time:0.005519ms
out_f32_th: ['1.26920044 ', '0.12124556 ', '0.38764721 '], time:0.016499ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['0.49243164 ', '-0.44116211 ', '-0.14611816 '], time:0.011182ms
out_f16x8: ['0.49243164 ', '-0.44116211 ', '-0.14611816 '], time:0.004947ms
out_f16x8_pack: ['0.49243164 ', '-0.44116211 ', '-0.14611816 '], time:0.004029ms
out_f16_th: ['0.49243164 ', '-0.44116211 ', '-0.14611816 '], time:0.016606ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=4096, SeqLen=4096, EmbSize=512
out_f32: ['-1.29970169 ', '-1.45127702 ', '-0.7259807 '], time:0.008452ms
out_f32x4: ['-1.29970169 ', '-1.45127702 ', '-0.7259807 '], time:0.005424ms
out_f32x4_pack: ['-1.29970169 ', '-1.45127702 ', '-0.7259807 '], time:0.005460ms
out_f32_th: ['-1.29970169 ', '-1.45127702 ', '-0.7259807 '], time:0.016463ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['0.88623047 ', '0.00491714 ', '0.38525391 '], time:0.008094ms
out_f16x8: ['0.88623047 ', '0.00491714 ', '0.38525391 '], time:0.005186ms
out_f16x8_pack: ['0.88623047 ', '0.00491714 ', '0.38525391 '], time:0.004339ms
out_f16_th: ['0.88623047 ', '0.00491714 ', '0.38525391 '], time:0.016451ms
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
MaxV=4096, SeqLen=4096, EmbSize=1024
out_f32: ['0.62899369 ', '0.55146962 ', '-0.03596229 '], time:0.021005ms
out_f32x4: ['0.62899369 ', '0.55146962 ', '-0.03596229 '], time:0.008404ms
out_f32x4_pack: ['0.62899369 ', '0.55146962 ', '-0.03596229 '], time:0.008738ms
out_f32_th: ['0.62899369 ', '0.55146962 ', '-0.03596229 '], time:0.056100ms
--------------------------------------------------------------------------------------------------------------
out_f16: ['1.13085938 ', '1.19628906 ', '-0.61035156 '], time:0.020254ms
out_f16x8: ['1.13085938 ', '1.19628906 ', '-0.61035156 '], time:0.007451ms
out_f16x8_pack: ['1.13085938 ', '1.19628906 ', '-0.61035156 '], time:0.005472ms
out_f16_th: ['1.13085938 ', '1.19628906 ', '-0.61035156 '], time:0.030160ms
--------------------------------------------------------------------------------------------------------------