diff --git a/404.html b/404.html
index af250097..8e1f0aa9 100644
--- a/404.html
+++ b/404.html
@@ -7,13 +7,13 @@ Page Not Found | MS-AMP
- +

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

- +
\ No newline at end of file
diff --git a/assets/images/gpt-performance-6ad54753eb117d15e155b4de69d5d788.png b/assets/images/gpt-performance-6ad54753eb117d15e155b4de69d5d788.png
new file mode 100644
index 00000000..a54241c4
Binary files /dev/null and b/assets/images/gpt-performance-6ad54753eb117d15e155b4de69d5d788.png differ
diff --git a/assets/images/gpt-performance-a7c6ea38d42167c6935652eeb0e2bbb3.png b/assets/images/gpt-performance-a7c6ea38d42167c6935652eeb0e2bbb3.png
deleted file mode 100644
index 8269d60d..00000000
Binary files a/assets/images/gpt-performance-a7c6ea38d42167c6935652eeb0e2bbb3.png and /dev/null differ
diff --git a/assets/js/53e18611.019b3b8a.js b/assets/js/53e18611.e716735c.js
similarity index 98%
rename from assets/js/53e18611.019b3b8a.js
rename to assets/js/53e18611.e716735c.js
index 60bb2607..284063d2 100644
[Minified webpack chunk for the Installation page omitted. Its embedded MDX content changes mirror the readable docs/getting-started/installation diff below: Docker image ghcr.io/azure/msamp:main-cuda12.1 -> main-cuda12.2 (container name msampcu121 -> msampcu122), NGC container nvcr.io/nvidia/pytorch:23.04-py3 -> 23.10-py3, and a trailing colon changed to a period.]
[Minified webpack runtime chunk omitted. Only its chunk-hash map changes, e.g. the entry for chunk 349 (53e18611) goes from 019b3b8a to e716735c.]
diff --git a/blog/index.html b/blog/index.html
@@ ... @@ Blog | MS-AMP
- +

Releasing MS-AMP v0.3

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.3.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.3.0 Release Notes#

MS-AMP Improvements#

  • Integrate latest Transformer Engine into MS-AMP
  • Integrate with latest Megatron-LM
  • Add a website for MS-AMP and improve documents
  • Add custom DistributedDataParallel which supports FP8 and computation/communication overlap
  • Refactor code in dist_op module
  • Support UT for distributed testing
  • Integrate with MSCCL

MS-AMP-Examples Improvements#

  • Support pretraining GPT-3 with Megatron-LM and MS-AMP
  • Provide a tool to print the traffic per second of NVLink and InfiniBand
  • Print TFLOPS and throughput metrics in all the examples

Document Improvements#

  • Add performance number in Introduction page
  • Enhance Usage page and Optimization Level page
  • Add Container Images page
  • Add Developer Guide section

Releasing MS-AMP v0.2

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.2.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.2.0 Release Notes#

MS-AMP Improvements#

  • Add O3 optimization for supporting FP8 in distributed training frameworks
  • Support ScalingTensor in functional.linear
  • Support customized attributes in FP8Linear
  • Improve performance
  • Add Dockerfiles for PyTorch 1.14 + CUDA 11.8 and PyTorch 2.1 + CUDA 12.1
  • Support PyTorch 2.1
  • Add performance results and Transformer Engine (TE) results to the homepage
  • Cache TE build in pipeline

MS-AMP-Examples Improvements#

Add 3 examples using MS-AMP:

- +
\ No newline at end of file
diff --git a/blog/release-msamp-v0.2/index.html b/blog/release-msamp-v0.2/index.html
index 059f68c8..11fb964b 100644
--- a/blog/release-msamp-v0.2/index.html
+++ b/blog/release-msamp-v0.2/index.html
@@ -7,13 +7,13 @@ Releasing MS-AMP v0.2 | MS-AMP
- +

Releasing MS-AMP v0.2

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.2.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.2.0 Release Notes#

MS-AMP Improvements#

  • Add O3 optimization for supporting FP8 in distributed training frameworks
  • Support ScalingTensor in functional.linear
  • Support customized attributes in FP8Linear
  • Improve performance
  • Add Dockerfiles for PyTorch 1.14 + CUDA 11.8 and PyTorch 2.1 + CUDA 12.1
  • Support PyTorch 2.1
  • Add performance results and Transformer Engine (TE) results to the homepage
  • Cache TE build in pipeline

MS-AMP-Examples Improvements#

Add 3 examples using MS-AMP:

- +
\ No newline at end of file
diff --git a/blog/release-msamp-v0.3/index.html b/blog/release-msamp-v0.3/index.html
index e55abf48..3a545fb8 100644
--- a/blog/release-msamp-v0.3/index.html
+++ b/blog/release-msamp-v0.3/index.html
@@ -7,13 +7,13 @@ Releasing MS-AMP v0.3 | MS-AMP
- +

Releasing MS-AMP v0.3

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.3.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.3.0 Release Notes#

MS-AMP Improvements#

  • Integrate latest Transformer Engine into MS-AMP
  • Integrate with latest Megatron-LM
  • Add a website for MS-AMP and improve documents
  • Add custom DistributedDataParallel which supports FP8 and computation/communication overlap
  • Refactor code in dist_op module
  • Support UT for distributed testing
  • Integrate with MSCCL

MS-AMP-Examples Improvements#

  • Support pretraining GPT-3 with Megatron-LM and MS-AMP
  • Provide a tool to print the traffic per second of NVLink and InfiniBand
  • Print TFLOPS and throughput metrics in all the examples

Document Improvements#

  • Add performance number in Introduction page
  • Enhance Usage page and Optimization Level page
  • Add Container Images page
  • Add Developer Guide section
- +
\ No newline at end of file
diff --git a/blog/tags/announcement/index.html b/blog/tags/announcement/index.html
index 47f930bf..3ec37b97 100644
--- a/blog/tags/announcement/index.html
+++ b/blog/tags/announcement/index.html
@@ -7,13 +7,13 @@ 2 posts tagged with "announcement" | MS-AMP
- +

2 posts tagged with "announcement"

View All Tags

Releasing MS-AMP v0.3

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.3.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.3.0 Release Notes#

MS-AMP Improvements#

  • Integrate latest Transformer Engine into MS-AMP
  • Integrate with latest Megatron-LM
  • Add a website for MS-AMP and improve documents
  • Add custom DistributedDataParallel which supports FP8 and computation/communication overlap
  • Refactor code in dist_op module
  • Support UT for distributed testing
  • Integrate with MSCCL

MS-AMP-Examples Improvements#

  • Support pretraining GPT-3 with Megatron-LM and MS-AMP
  • Provide a tool to print the traffic per second of NVLink and InfiniBand
  • Print TFLOPS and throughput metrics in all the examples

Document Improvements#

  • Add performance number in Introduction page
  • Enhance Usage page and Optimization Level page
  • Add Container Images page
  • Add Developer Guide section

Releasing MS-AMP v0.2

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.2.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.2.0 Release Notes#

MS-AMP Improvements#

  • Add O3 optimization for supporting FP8 in distributed training frameworks
  • Support ScalingTensor in functional.linear
  • Support customized attributes in FP8Linear
  • Improve performance
  • Add Dockerfiles for PyTorch 1.14 + CUDA 11.8 and PyTorch 2.1 + CUDA 12.1
  • Support PyTorch 2.1
  • Add performance results and Transformer Engine (TE) results to the homepage
  • Cache TE build in pipeline

MS-AMP-Examples Improvements#

Add 3 examples using MS-AMP:

- +
\ No newline at end of file
diff --git a/blog/tags/index.html b/blog/tags/index.html
index ba4c6fd5..5e356bb4 100644
--- a/blog/tags/index.html
+++ b/blog/tags/index.html
@@ -7,13 +7,13 @@ Tags | MS-AMP
- +
- +
\ No newline at end of file
diff --git a/blog/tags/ms-amp/index.html b/blog/tags/ms-amp/index.html
index e039adff..51952c8d 100644
--- a/blog/tags/ms-amp/index.html
+++ b/blog/tags/ms-amp/index.html
@@ -7,13 +7,13 @@ 2 posts tagged with "ms-amp" | MS-AMP
- +

2 posts tagged with "ms-amp"

View All Tags

Releasing MS-AMP v0.3

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.3.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.3.0 Release Notes#

MS-AMP Improvements#

  • Integrate latest Transformer Engine into MS-AMP
  • Integrate with latest Megatron-LM
  • Add a website for MS-AMP and improve documents
  • Add custom DistributedDataParallel which supports FP8 and computation/communication overlap
  • Refactor code in dist_op module
  • Support UT for distributed testing
  • Integrate with MSCCL

MS-AMP-Examples Improvements#

  • Support pretraining GPT-3 with Megatron-LM and MS-AMP
  • Provide a tool to print the traffic per second of NVLink and InfiniBand
  • Print TFLOPS and throughput metrics in all the examples

Document Improvements#

  • Add performance number in Introduction page
  • Enhance Usage page and Optimization Level page
  • Add Container Images page
  • Add Developer Guide section

Releasing MS-AMP v0.2

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.2.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.2.0 Release Notes#

MS-AMP Improvements#

  • Add O3 optimization for supporting FP8 in distributed training frameworks
  • Support ScalingTensor in functional.linear
  • Support customized attributes in FP8Linear
  • Improve performance
  • Add Dockerfiles for PyTorch 1.14 + CUDA 11.8 and PyTorch 2.1 + CUDA 12.1
  • Support PyTorch 2.1
  • Add performance results and Transformer Engine (TE) results to the homepage
  • Cache TE build in pipeline

MS-AMP-Examples Improvements#

Add 3 examples using MS-AMP:

- +
\ No newline at end of file
diff --git a/blog/tags/release/index.html b/blog/tags/release/index.html
index 3b05c1f9..1edd1f76 100644
--- a/blog/tags/release/index.html
+++ b/blog/tags/release/index.html
@@ -7,13 +7,13 @@ 2 posts tagged with "release" | MS-AMP
- +

2 posts tagged with "release"

View All Tags

Releasing MS-AMP v0.3

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.3.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.3.0 Release Notes#

MS-AMP Improvements#

  • Integrate latest Transformer Engine into MS-AMP
  • Integrate with latest Megatron-LM
  • Add a website for MS-AMP and improve documents
  • Add custom DistributedDataParallel which supports FP8 and computation/communication overlap
  • Refactor code in dist_op module
  • Support UT for distributed testing
  • Integrate with MSCCL

MS-AMP-Examples Improvements#

  • Support pretraining GPT-3 with Megatron-LM and MS-AMP
  • Provide a tool to print the traffic per second of NVLink and InfiniBand
  • Print TFLOPS and throughput metrics in all the examples

Document Improvements#

  • Add performance number in Introduction page
  • Enhance Usage page and Optimization Level page
  • Add Container Images page
  • Add Developer Guide section

Releasing MS-AMP v0.2

· One min read
MS-AMP Team

We are very happy to announce that MS-AMP v0.2.0 is officially released today!

You can install and try MS-AMP by following the Getting Started Tutorial.

MS-AMP 0.2.0 Release Notes#

MS-AMP Improvements#

  • Add O3 optimization for supporting FP8 in distributed training frameworks
  • Support ScalingTensor in functional.linear
  • Support customized attributes in FP8Linear
  • Improve performance
  • Add Dockerfiles for PyTorch 1.14 + CUDA 11.8 and PyTorch 2.1 + CUDA 12.1
  • Support PyTorch 2.1
  • Add performance results and Transformer Engine (TE) results to the homepage
  • Cache TE build in pipeline

MS-AMP-Examples Improvements#

Add 3 examples using MS-AMP:

- +
\ No newline at end of file
diff --git a/docs/developer-guides/contributing/index.html b/docs/developer-guides/contributing/index.html
index 2d65c924..3b84634c 100644
--- a/docs/developer-guides/contributing/index.html
+++ b/docs/developer-guides/contributing/index.html
@@ -7,7 +7,7 @@ Contributing | MS-AMP
- +
@@ -19,7 +19,7 @@ provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the Microsoft Open Source Code of Conduct. For more information see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.

How to Contribute#

Contribute New Feature#

MS-AMP is an open-source project. Your participation and contribution are highly appreciated. There are several important things you need to know before contributing a new feature to this project:

What content can be added to MS-AMP#

  1. Bug fixes for existing features.
  2. Performance improvements.
  3. New features, such as support for a new distributed training framework.

If you would like to contribute a new feature to MS-AMP, please submit your proposal first. In the GitHub Issues module, choose Enhancement Request to submit it. If the proposal is accepted, you can submit pull requests to the origin main branch.

Contribution steps#

If you would like to contribute to the project, please follow the steps below for joint development on GitHub.

  1. Fork the repo first to your personal GitHub account.
  2. Check out a new branch from main for feature development.
  3. When you finish the feature, fetch the latest code from the origin repo, merge it into your branch, and resolve any conflicts.
  4. Submit a pull request to the origin main branch.
  5. Please note that there might be comments or questions from reviewers; you may need to update the pull request accordingly.
- +
\ No newline at end of file
diff --git a/docs/developer-guides/development/index.html b/docs/developer-guides/development/index.html
index 60d2f82e..2ad549d5 100644
--- a/docs/developer-guides/development/index.html
+++ b/docs/developer-guides/development/index.html
@@ -7,7 +7,7 @@ Development | MS-AMP
- +
@@ -15,7 +15,7 @@

Development

If you want to develop a new feature, please follow the steps below to set up the development environment.

We suggest you use Visual Studio Code and install the recommended extensions for this project. You can also develop online with GitHub Codespaces.

Check Environment#

Follow System Requirements.

Set up#

Clone code.

git clone --recurse-submodules https://github.com/azure/MS-AMP
cd MS-AMP

Install MS-AMP.

python3 -m pip install --upgrade pip
python3 -m pip install -e .[test]
make postinstall

Install MSCCL and preload msamp_dist library.

cd third_party/msccl

# H100
make -j src.build NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90"

apt-get update
apt install build-essential devscripts debhelper fakeroot
make pkg.debian.build
dpkg -i build/pkg/deb/libnccl2_*.deb
dpkg -i build/pkg/deb/libnccl-dev_2*.deb

cd -
NCCL_LIBRARY=/usr/lib/x86_64-linux-gnu/libnccl.so # Change as needed
export LD_PRELOAD="/usr/local/lib/libmsamp_dist.so:${NCCL_LIBRARY}:${LD_PRELOAD}"

Lint and Test#

Format code using yapf.

python3 setup.py format

Check code style with mypy and flake8.

python3 setup.py lint

Run unit tests.

python3 setup.py test

Open a pull request to the main branch on GitHub.

- +
\ No newline at end of file
diff --git a/docs/developer-guides/using-docker/index.html b/docs/developer-guides/using-docker/index.html
index c1fc3470..57d4e82c 100644
--- a/docs/developer-guides/using-docker/index.html
+++ b/docs/developer-guides/using-docker/index.html
@@ -7,13 +7,13 @@ Using Docker | MS-AMP
- +

Using Docker

MS-AMP provides a Dockerfile to simplify the process of setting up the development environment. Here are some guidelines on how to build images and start containers during development.

Build image#

You need to clone the code first before building the image.

export DOCKER_BUILDKIT=1
docker buildx build \
  --platform linux/amd64 --cache-to type=inline,mode=max \
  --tag msamp-dev-cuda121 --file dockerfile/torch2.1-cuda12.1.dockerfile .

Run container#

docker run \
  -itd --name=msamp-dev \
  --privileged --net=host --ipc=host \
  --gpus=all \
  -w /root -v /mnt:/mnt \
  msamp-dev-cuda121 bash
- +
\ No newline at end of file
diff --git a/docs/getting-started/installation/index.html b/docs/getting-started/installation/index.html
index aa1293cb..4ce06669 100644
--- a/docs/getting-started/installation/index.html
+++ b/docs/getting-started/installation/index.html
@@ -7,17 +7,17 @@ Installation | MS-AMP
- +
-

Installation

Requirements#

Here are the system requirements for MS-AMP.

  • Latest version of Linux; you're highly encouraged to use Ubuntu 18.04 or later.
  • NVIDIA GPU (e.g. H100/A100) and compatible drivers should be installed correctly.
  • Driver version can be checked by running nvidia-smi.
  • Python version 3.7 or later (which can be checked by running python3 --version).
  • Pip version 18.0 or later (which can be checked by running python3 -m pip --version).
  • CUDA version 11 or later (which can be checked by running nvcc --version).
  • PyTorch version 1.14 or later (which can be checked by running python -c "import torch; print(torch.__version__)").

You can try MS-AMP in two ways: Using Docker or installing from source:

  • Using Docker is a convenient way to get started with MS-AMP. You can use the pre-built Docker image to quickly set up an environment for running MS-AMP.
  • On the other hand, installing from source gives you more control over the installation process and allows you to customize the installation to your needs.

Use Docker#

You can try the latest MS-AMP Docker container with the following commands:

sudo docker run -it -d --name=msampcu121 --privileged --net=host --ipc=host --gpus=all -v /:/hostroot ghcr.io/azure/msamp:main-cuda12.1 bash
sudo docker exec -it msampcu121 bash

MS-AMP is pre-installed in the Docker container, and you can verify it by running:

python -c 'import msamp;print(msamp.__version__)'

We also provide stable Docker images here.

Install from source#

We strongly recommend using the PyTorch NGC Container to avoid messing up the local environment.
-For example, to start PyTorch 2.1 container, run the following command:

sudo docker run -it -d --name=msamp --privileged --net=host --ipc=host --gpus=all nvcr.io/nvidia/pytorch:23.04-py3 bash
sudo docker exec -it msamp bash

Then, you can clone the source from GitHub.

git clone https://github.com/Azure/MS-AMP.git
cd MS-AMP
git submodule update --init --recursive

If you want to train a model with multiple GPUs, you need to install MSCCL to support FP8. Please note that the compilation of MSCCL may take ~40 minutes on A100 nodes and ~7 minutes on H100 nodes.

cd third_party/msccl
+

Installation

Requirements#

Here are the system requirements for MS-AMP.

  • Latest version of Linux; you're highly encouraged to use Ubuntu 18.04 or later.
  • NVIDIA GPU (e.g. H100/A100) and compatible drivers should be installed correctly.
  • Driver version can be checked by running nvidia-smi.
  • Python version 3.7 or later (which can be checked by running python3 --version).
  • Pip version 18.0 or later (which can be checked by running python3 -m pip --version).
  • CUDA version 11 or later (which can be checked by running nvcc --version).
  • PyTorch version 1.14 or later (which can be checked by running python -c "import torch; print(torch.__version__)").

You can try MS-AMP in two ways: using Docker or installing from source.

  • Using Docker is a convenient way to get started with MS-AMP. You can use the pre-built Docker image to quickly set up an environment for running MS-AMP.
  • On the other hand, installing from source gives you more control over the installation process and allows you to customize the installation to your needs.

Use Docker#

You can try the latest MS-AMP Docker container with the following commands:

sudo docker run -it -d --name=msampcu122 --privileged --net=host --ipc=host --gpus=all -v /:/hostroot ghcr.io/azure/msamp:main-cuda12.2 bash
sudo docker exec -it msampcu122 bash

MS-AMP is pre-installed in the Docker container, and you can verify it by running:

python -c 'import msamp;print(msamp.__version__)'

We also provide stable Docker images here.

Install from source#

We strongly recommend using the PyTorch NGC Container to avoid messing up the local environment.
+For example, to start PyTorch 2.1 container, run the following command:

sudo docker run -it -d --name=msamp --privileged --net=host --ipc=host --gpus=all nvcr.io/nvidia/pytorch:23.10-py3 bash
sudo docker exec -it msamp bash

Then, you can clone the source from GitHub.

git clone https://github.com/Azure/MS-AMP.git
cd MS-AMP
git submodule update --init --recursive

If you want to train a model with multiple GPUs, you need to install MSCCL to support FP8. Please note that the compilation of MSCCL may take ~40 minutes on A100 nodes and ~7 minutes on H100 nodes.

cd third_party/msccl
# A100
make -j src.build NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
# H100
make -j src.build NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90"
apt-get update
apt install build-essential devscripts debhelper fakeroot
make pkg.debian.build
dpkg -i build/pkg/deb/libnccl2_*.deb
dpkg -i build/pkg/deb/libnccl-dev_2*.deb
 cd -

Then, you can install MS-AMP from source.

python3 -m pip install --upgrade pip
python3 -m pip install .
make postinstall

Before using MS-AMP, you need to preload the msampfp8 library and its dependencies:

NCCL_LIBRARY=/usr/lib/x86_64-linux-gnu/libnccl.so # Change as needed
export LD_PRELOAD="/usr/local/lib/libmsamp_dist.so:${NCCL_LIBRARY}:${LD_PRELOAD}"

After that, you can verify the installation by running:

python3 -c "import msamp; print(msamp.__version__)"
- +
\ No newline at end of file
diff --git a/docs/getting-started/run-msamp/index.html b/docs/getting-started/run-msamp/index.html
index bc270ec7..81d54769 100644
--- a/docs/getting-started/run-msamp/index.html
+++ b/docs/getting-started/run-msamp/index.html
@@ -7,13 +7,13 @@ Run Examples | MS-AMP
- +

Run Examples

After installing MS-AMP, you can run several simple examples using MS-AMP. Please note that before running these commands, you need to change the working directory to examples.

MNIST#

1. Run mnist using a single GPU#

python mnist.py --enable-msamp --opt-level=O2

2. Run mnist using multiple GPUs in a single node#

torchrun --nproc_per_node=8 mnist_ddp.py --enable-msamp --opt-level=O2

3. Run mnist using FSDP#

python mnist_fsdp.py --msamp

CIFAR10#

1. Run cifar10 using deepspeed#

deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config.json

2. Run cifar10 using deepspeed with msamp enabled#

deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config_msamp.json

3. Run cifar10 using deepspeed-ZeRO with msamp enabled#

deepspeed cifar10_deepspeed.py --deepspeed --deepspeed_config ds_config_zero_msamp.json

4. Run cifar10 using deepspeed-ZeRO + TE with msamp enabled#

deepspeed cifar10_deepspeed_te.py --deepspeed --deepspeed_config ds_config_zero_te_msamp.json
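
The *_msamp config files used above differ from the plain DeepSpeed configs mainly by an "msamp" section. Below is a minimal sketch of such a config written as a Python dict (the field values are hypothetical placeholders, and the exact JSON schema may vary between MS-AMP versions, so treat this as illustrative rather than authoritative):

# Hypothetical sketch of a DeepSpeed config with MS-AMP enabled; in practice
# this lives as JSON in a file such as ds_config_msamp.json.
ds_config = {
    "train_batch_size": 32,   # placeholder value, not taken from the docs
    "msamp": {
        "enabled": True,      # turn MS-AMP on inside DeepSpeed
        "opt_level": "O3",    # O1/O2/O3; O3 adds ZeRO support (see Optimization Level)
    },
}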

For more comprehensive examples, please go to MS-AMP-Examples.

- +
\ No newline at end of file
diff --git a/docs/introduction/index.html b/docs/introduction/index.html
index f5707451..b08c808e 100644
--- a/docs/introduction/index.html
+++ b/docs/introduction/index.html
@@ -7,13 +7,13 @@ Introduction | MS-AMP
- +
-

Introduction

Features#

MS-AMP is an automatic mixed precision package for deep learning developed by Microsoft.

Features:

  • Support O1 optimization: Apply FP8 to weights and weight gradients and support FP8 in communication.
  • Support O2 optimization: Support FP8 for two optimizers (Adam and AdamW).
  • Support O3 optimization: Support FP8 for distributed parallel training and ZeRO optimizer, which is essential for training large-scale models.
  • Provide four training examples applying MS-AMP: Swin-Transformer, DeiT, RoBERTa and GPT-3.

MS-AMP has the following benefits compared with Transformer Engine:

  • Speed up memory-limited operations by accessing one byte instead of the two or four bytes of half or single precision.
  • Reduce memory requirements for training models, enabling larger models.
  • Speed up communication for distributed models by transmitting lower-precision gradients.
  • Reduce training time for large language models with larger minibatches.

Performance#

Model performance#

We evaluated the training loss and validation performance of four typical models, GPT-3, Swin-Transformer, DeiT and RoBERTa, using both MS-AMP and FP16/BF16 AMP. Our observations show that the models trained with MS-AMP achieved performance comparable to those trained using FP16/BF16 AMP. This demonstrates the effectiveness of mixed FP8 in MS-AMP.

Here are the results for GPT-3, Swin-T, DeiT-S and RoBERTa-B.

image

image

System performance#

MS-AMP preserves high-precision's accuracy while using only a fraction of the memory footprint on a range of tasks, including GPT-3, DeiT and Swin Transformer. For example, when training GPT-175B on the NVIDIA H100 platform, MS-AMP achieves a notable 42% reduction in real memory usage compared with the BF16 mixed-precision approach, and reduces training time by 17% compared with Transformer Engine. For small models, MS-AMP with O2 mode can achieve 44% memory saving for Swin-1.0B and 26% memory saving for ViT-1.2B, compared with FP16 AMP.

Here are the results for GPT-3:

Image

Here, TP, PP, and DP represent tensor, pipeline, and data parallelism respectively. BS indicates batch size, while MFU denotes model FLOPs utilization. Weight-related communication includes the all-gather operator on weights and the reduce-scatter operator on weight gradients.

Here are the results for Swin-1.0B and ViT-1.2B.

Image

For detailed setting and results, please go to MS-AMP-Example.

- +

Introduction

Features#

MS-AMP is an automatic mixed precision package for deep learning developed by Microsoft.

Features:

  • Support O1 optimization: Apply FP8 to weights and weight gradients and support FP8 in communication.
  • Support O2 optimization: Support FP8 for two optimizers (Adam and AdamW).
  • Support O3 optimization: Support FP8 for distributed parallel training and ZeRO optimizer, which is essential for training large-scale models.
  • Provide four training examples applying MS-AMP: Swin-Transformer, DeiT, RoBERTa and GPT-3.

MS-AMP has the following benefits compared with Transformer Engine:

  • Speed up memory-limited operations by accessing one byte instead of the two or four bytes of half or single precision.
  • Reduce memory requirements for training models, enabling larger models.
  • Speed up communication for distributed models by transmitting lower-precision gradients.
  • Reduce training time for large language models with larger minibatches.

Performance#

Model performance#

We evaluated the training loss and validation performance of four typical models, GPT-3, Swin-Transformer, DeiT and RoBERTa, using both MS-AMP and FP16/BF16 AMP. Our observations show that the models trained with MS-AMP achieved performance comparable to those trained using FP16/BF16 AMP. This demonstrates the effectiveness of mixed FP8 in MS-AMP.

Here are the results for GPT-3, Swin-T, DeiT-S and RoBERTa-B.

image

image

System performance#

MS-AMP preserves high-precision's accuracy while using only a fraction of the memory footprint on a range of tasks, including GPT-3, DeiT and Swin Transformer. For example, when training GPT-175B on the NVIDIA H100 platform, MS-AMP achieves a notable 42% reduction in real memory usage compared with the BF16 mixed-precision approach, and reduces training time by 17% compared with Transformer Engine. For small models, MS-AMP with O2 mode can achieve 44% memory saving for Swin-1.0B and 26% memory saving for ViT-1.2B, compared with FP16 AMP.

Here are the results for GPT-3:

Image

Here, TP, PP, and DP represent tensor, pipeline, and data parallelism respectively. BS indicates batch size, while MFU denotes model FLOPs utilization. Weight-related communication includes the all-gather operator on weights and the reduce-scatter operator on weight gradients.
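
For reference, a definition not spelled out on this page but conventional in the literature: MFU is the ratio of achieved to peak throughput,

MFU = (model FLOPs per second achieved during training) / (theoretical peak FLOPs per second of the hardware)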

Here are the results for Swin-1.0B and ViT-1.2B.

Image

For detailed setting and results, please go to MS-AMP-Example.

+
\ No newline at end of file
diff --git a/docs/user-tutorial/container-images/index.html b/docs/user-tutorial/container-images/index.html
index 9ec98852..b0fb6e3b 100644
--- a/docs/user-tutorial/container-images/index.html
+++ b/docs/user-tutorial/container-images/index.html
@@ -7,13 +7,13 @@ Container Images | MS-AMP
- +

Container Images

MS-AMP provides a set of OCI-compliant container images, which are hosted on GitHub Container Registry.

You can use the MS-AMP image via ghcr.io/azure/msamp:${tag}; the available tags for all stable versions are listed below.

Stable tagged versions#

Tag             | Description
----------------|------------------------------
v0.3.0-cuda12.1 | MS-AMP v0.3.0 with CUDA 12.1
v0.3.0-cuda11.8 | MS-AMP v0.3.0 with CUDA 11.8
v0.2.0-cuda12.1 | MS-AMP v0.2.0 with CUDA 12.1
v0.2.0-cuda11.8 | MS-AMP v0.2.0 with CUDA 11.8
- +
\ No newline at end of file
diff --git a/docs/user-tutorial/optimization-level/index.html b/docs/user-tutorial/optimization-level/index.html
index b4a941c2..8dec4813 100644
--- a/docs/user-tutorial/optimization-level/index.html
+++ b/docs/user-tutorial/optimization-level/index.html
@@ -7,13 +7,13 @@ Optimization Level | MS-AMP
- +

Optimization Level

Currently MS-AMP supports three optimization levels: O1, O2 and O3. The three levels gradually incorporate 8-bit collective communication, optimizer and distributed parallel training in an incremental manner. Users can directly set O1/O2 using msamp.initialize, and set O3 in the config file when using DeepSpeed (a minimal sketch of msamp.initialize follows the list below).

  • O1: We found that directly transitioning weight gradients from FP32 to FP8 in the Transformer Engine leads to a decrease in accuracy. However, this issue is resolved in O1 through the implementation of FP8 for weight gradients and AllReduce communication. This optimization also has the added benefits of saving GPU memory and reducing communication bandwidth.

  • O2: From O1 to O2, our main focus is on enabling the use of low-bit data formats for auxiliary tensors in the Adam/AdamW optimizer without any loss in accuracy. Specifically, we are able to maintain accuracy by representing the first-order optimizer state in FP8 and the second-order state in FP16. This optimization has the potential to save up to 62.5% of GPU memory for the optimizer when the model size is particularly large.

  • O3: This optimization level is specifically designed for FP8 support in distributed parallel training for large-scale models. Frequently used strategies here include data parallelism, tensor parallelism, pipeline parallelism, sequence parallelism and the ZeRO optimizer. ZeRO separates model weights into regular weights and master weights, with the former used for network forward/backward on each GPU, and the latter used for model updating in the optimizer. This separation allows us to use 8-bit data precision for regular weights and weight broadcasting, which reduces GPU memory and bandwidth usage even further.
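
As a concrete illustration of setting O1/O2, here is a minimal sketch using the msamp.initialize API mentioned above (the model and optimizer are toy placeholders; for O3 the level is set in the DeepSpeed config file instead):

import torch
import msamp

# A toy model and a standard optimizer, used here only for illustration.
model = torch.nn.Linear(1024, 1024).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Wrap both with MS-AMP; opt_level selects O1 or O2 as described above.
model, optimizer = msamp.initialize(model, optimizer, opt_level="O2")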

Here are details of different MS-AMP optimization levels:

Optimization Level | Computation (GEMM) | Comm | Weight | Master Weight | Weight Gradient | Optimizer States
-------------------|--------------------|------|--------|---------------|-----------------|-----------------
FP16 AMP           | FP16               | FP32 | FP32   | N/A           | FP32            | FP32 + FP32
Nvidia TE          | FP8                | FP32 | FP32   | N/A           | FP32            | FP32 + FP32
MS-AMP O1          | FP8                | FP8  | FP16   | N/A           | FP8             | FP32 + FP32
MS-AMP O2          | FP8                | FP8  | FP16   | N/A           | FP8             | FP8 + FP16
MS-AMP O3          | FP8                | FP8  | FP8    | FP16          | FP8             | FP8 + FP16
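
The 62.5% optimizer-memory saving quoted for O2 can be sanity-checked with simple byte counting per parameter (a back-of-the-envelope calculation added here for clarity, assuming 1 byte per FP8 value, 2 bytes per FP16 value and 4 bytes per FP32 value):

Adam/AdamW states in FP32: 4 B (first order) + 4 B (second order) = 8 B per parameter
MS-AMP O2 states:          1 B (FP8 first order) + 2 B (FP16 second order) = 3 B per parameter
Saving:                    (8 - 3) / 8 = 62.5%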
- +
\ No newline at end of file
diff --git a/docs/user-tutorial/usage/index.html b/docs/user-tutorial/usage/index.html
index 30a14846..3ed3856a 100644
--- a/docs/user-tutorial/usage/index.html
+++ b/docs/user-tutorial/usage/index.html
@@ -7,7 +7,7 @@ Use MS-AMP | MS-AMP
- +
@@ -19,7 +19,7 @@
# Initialize model
from msamp.fsdp import FsdpReplacer
from msamp.fsdp import FP8FullyShardedDataParallel

my_auto_wrap_policy = ...
model = FsdpReplacer.replace(model)
model = FP8FullyShardedDataParallel(model, use_orig_params=True, auto_wrap_policy=my_auto_wrap_policy)

# Initialize optimizer
from msamp.optim import FSDPAdam
optimizer = FSDPAdam(model.parameters(), lr=3e-04)

Please note that currently we only support use_orig_params=True.
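
A training step with the wrapped model then looks like standard FSDP training. A minimal sketch continuing the snippet above (dataloader and loss_fn are placeholders, not part of MS-AMP):

# model and optimizer come from the FSDP snippet above.
for inputs, targets in dataloader:
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)  # forward pass
    loss.backward()                         # backward pass
    optimizer.step()                        # FSDPAdam applies the update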

Usage in Megatron-DeepSpeed and Megatron-LM#

For integrating MS-AMP with Megatron-DeepSpeed and Megatron-LM, you need to make some code changes. We provide a patch as a reference for the integration. Here are the instructions for integrating MS-AMP with Megatron-DeepSpeed/Megatron-LM and for running GPT-3 with MS-AMP.

Runnable, simple examples demonstrating good practices can be found here. For more comprehensive examples, please go to MS-AMP-Examples.

- +
\ No newline at end of file
diff --git a/index.html b/index.html
index 100b98e7..194042db 100644
--- a/index.html
+++ b/index.html
@@ -7,13 +7,13 @@ MS-AMP Documentation | MS-AMP
- +

MS-AMP

Automatic mixed precision package for deep learning developed by Microsoft

- +
\ No newline at end of file
diff --git a/search/index.html b/search/index.html
index 6005c47e..ae058183 100644
--- a/search/index.html
+++ b/search/index.html
@@ -7,13 +7,13 @@ Search the documentation | MS-AMP
- +

Search the documentation

- +
\ No newline at end of file