runai submit-dist tf

Description

Version 2.10 and later.

Submit a distributed TensorFlow training run:ai job to run.

Note

To use distributed training you need to have installed the < insert TensorFlow operator here > as specified < insert pre-requisites link here >.

Syntax notes:

  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

Examples

runai submit-dist tf --name distributed-job --workers=2 -g 1 \
     -i <image_name>

Options

Distributed

--clean-pod-policy < string >

The CleanPodPolicy controls deletion of pods when a job terminates. The policy can be one of the following values:

  • Running—only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
  • All—all (including completed) pods will be deleted immediately when the job finishes.
  • None—no pods will be deleted when the job completes.

--workers <int>

Number of worker replicas for the distributed training Job.

Naming and Shortcuts

--job-name-prefix <string>

The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

--name <string>

The name of the Job.

--template <string>

Load default values from a workload.

Container Definition

--add-capability <stringArray>

Add linux capabilities to the container.

-a | --annotation <stringArray>

Set annotations variables in the container.

--attach

Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

The --attach flag also sets --tty and --stdin to true.

--command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)
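
As an illustration, a complete submission that overrides the entry point might look like the following sketch (the training script and its arguments are placeholders, not part of this reference):

runai submit-dist tf --name distributed-job --workers=2 -g 1 \
     -i <image_name> --command -- python train.py --epochs 10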

--create-home-dir

Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

--image <string> | -i <string>

Image to use when creating the container for this Job

--image-pull-policy <string>

Pulling policy of the image when starting a container. Options are:

  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
  • IfNotPresent: the image is pulled only if it is not already present locally.
  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

For more information see Kubernetes documentation.

-l | --label <stringArray>

Set labels variables in the container.

--preferred-pod-topology-key <string>

If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

--required-pod-topology-key <string>

Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

-t | --tty

Allocate a pseudo-TTY.

--working-dir <string>

Starts the container with the specified directory as the current directory.

Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string>

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

Job Lifecycle

--backoff-limit <int>

The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

Storage

--git-sync <stringArray>

Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

--large-shm

Mount a large /dev/shm device.

--mount-propagation

Enable HostToContainer mount propagation for all container volumes

--nfs-server <string>

Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

--pvc Pvc_Name:Container_Mount_Path:[ro]

Mount a persistent volume claim into a container.

Note

This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

Container_Mount_Path. A path internal to the container where the storage will be mounted

Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

Examples:

--pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

--pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

--pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

--pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

--pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

--pvc-exists <string>

Mount a persistent volume. You must include a claimname and path.

  • claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

  • path—the path internal to the container where the storage will be mounted

Use the format:

claimname=<CLAIM_NAME>,path=<PATH>
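
For example (the claim name and mount path below are hypothetical):

--pvc-exists claimname=my-claim,path=/data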

--pvc-new <string>

Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

  • claim name—The name of the persistent volume claim.
  • storage class—A storage class name that can be obtained by running

kubectl get storageclasses.storage.k8s.io.

storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

  • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
  • accessmode—The description of the desired volume capabilities for the PVC.
  • ro—Mount the PVC with read-only access.
  • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=<accessmode>

--s3 <string>

Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazon.com

-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

Volumes to mount into the container.

Examples:

-v /raid/public/john/data:/root/data:ro

Mount /root/data to local path /raid/public/john/data for read-only access.

-v /public/data:/root/data::nfs.example.com

Mount /root/data to NFS path /public/data on NFS server nfs.example.com for read-write access.

Network

--address <string>

Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

--host-ipc

Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

For further information see docker run reference documentation.

--host-network

Use the host's network stack inside the container. For further information see docker run reference documentation.

--port <stringArray>

Expose ports from the Job container.

-s | --service-type <string>

External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

Access Control

--allow-privilege-escalation

Allow the job to gain additional privileges after start.

--run-as-user

Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

Scheduling

--node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

--node-type <string>

Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

--toleration <string>

Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

The format of the string:

operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]
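
For example, the following sketch (the taint key is hypothetical) tolerates nodes tainted with a NoSchedule taint:

--toleration operator=Exists,key=dedicated,effect=NoSchedule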

Global Flags

--loglevel (string)

Set the logging level. One of: debug | info | warn | error (default "info")

--project | -p (string)

Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

--help | -h

Show help text.

Output

The command will attempt to submit a distributed TensorFlow Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

See Also


Last update: 2023-07-16
Created: 2023-03-07

runai submit-dist mpi

Description

Submit a Distributed Training (MPI) Run:ai Job to run.

Note

To use distributed training you need to have installed the Kubeflow MPI Operator as specified here

Syntax notes:

  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

Examples

You can start an unattended mpi training Job of name dist1, based on Project team-a using a quickstart-distributed image:

runai submit-dist mpi --name dist1 --workers=2 -g 1 \
     -i gcr.io/run-ai-demo/quickstart-distributed:v0.3.0 -e RUNAI_SLEEP_SECS=60

(see: distributed training Quickstart).

Options

Distributed

--clean-pod-policy < string >

The CleanPodPolicy controls deletion of pods when a job terminates. The policy can be one of the following values:

  • Running—only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
  • All—all (including completed) pods will be deleted immediately when the job finishes.
  • None—no pods will be deleted when the job completes.

--workers < int >

Number of worker replicas for the distributed training Job.

--slots-per-worker < int >

Number of slots to allocate for each worker.
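
As a sketch, the distributed flags above can be combined in a single submission (the slot count is illustrative; the image is the one used in the Examples section):

runai submit-dist mpi --name dist1 --workers=2 --slots-per-worker=1 -g 1 \
     -i gcr.io/run-ai-demo/quickstart-distributed:v0.3.0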

Naming and Shortcuts

--job-name-prefix <string>

The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

--name <string>

The name of the Job.

--template <string>

Load default values from a workload.

Container Definition

--add-capability <stringArray>

Add linux capabilities to the container.

-a | --annotation <stringArray>

Set annotations variables in the container.

--attach

Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

The --attach flag also sets --tty and --stdin to true.

--command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

--create-home-dir

Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

--image <string> | -i <string>

Image to use when creating the container for this Job

--image-pull-policy <string>

Pulling policy of the image when starting a container. Options are:

  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
  • IfNotPresent: the image is pulled only if it is not already present locally.
  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

For more information see Kubernetes documentation.

-l | --label <stringArray>

Set labels variables in the container.

--preferred-pod-topology-key <string>

If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

--required-pod-topology-key <string>

Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

-t | --tty

Allocate a pseudo-TTY.

--working-dir <string>

Starts the container with the specified directory as the current directory.

Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string>

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

Job Lifecycle

--backoff-limit <int>

The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

Storage

--git-sync <stringArray>

Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.
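
For illustration only (the repository URL, branch, and target directory are placeholders; the remaining fields follow the syntax above):

--git-sync source=https://github.com/example/repo.git,branch=main,target=/workspace/repo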

--large-shm

Mount a large /dev/shm device.

--mount-propagation

Enable HostToContainer mount propagation for all container volumes

--nfs-server <string>

Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

--pvc Pvc_Name:Container_Mount_Path:[ro]

Mount a persistent volume claim into a container.

Note

This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

Container_Mount_Path. A path internal to the container where the storage will be mounted

Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

Examples:

--pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

--pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

--pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

--pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

--pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

--pvc-exists <string>

Mount a persistent volume. You must include a claimname and path.

  • claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

  • path—the path internal to the container where the storage will be mounted

Use the format:

claimname=<CLAIM_NAME>,path=<PATH>

--pvc-new <string>

Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

  • claim name—The name of the persistent volume claim.
  • storage class—A storage class name that can be obtained by running

kubectl get storageclasses.storage.k8s.io.

storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

  • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
  • accessmode—The description of the desired volume capabilities for the PVC.
  • ro—Mount the PVC with read-only access.
  • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=<accessmode>

--s3 <string>

Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazon.com
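
For example (bucket name, credentials, and target path are placeholders; url is omitted so the default is used):

--s3 bucket=my-bucket,key=MY_ACCESS_KEY,secret=MY_SECRET_KEY,target=/data/s3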

-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

Volumes to mount into the container.

Examples:

-v /raid/public/john/data:/root/data:ro

Mount /root/data to local path /raid/public/john/data for read-only access.

-v /public/data:/root/data::nfs.example.com

Mount /root/data to NFS path /public/data on NFS server nfs.example.com for read-write access.

Network

--address <string>

Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

--host-ipc

Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

For further information see docker run reference documentation.

--host-network

Use the host's network stack inside the container. For further information see docker run reference documentation.

--port <stringArray>

Expose ports from the Job container.

-s | --service-type <string>

External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

Access Control

--allow-privilege-escalation

Allow the job to gain additional privileges after start.

--run-as-user

Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

Scheduling

--node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

--node-type <string>

Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

--toleration <string>

Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

The format of the string:

operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]

Global Flags

--loglevel (string)

Set the logging level. One of: debug | info | warn | error (default "info")

--project | -p (string)

Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

--help | -h

Show help text.

Output

The command will attempt to submit an mpi Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

See Also


Last update: 2023-07-16
Created: 2020-07-19

runai submit-dist pytorch

Description

Version 2.10 and later.

Submit a distributed PyTorch training run:ai job to run.

Note

To use distributed training you need to have installed the < insert pytorch operator here > as specified < insert pre-requisites link here >.

Syntax notes:

  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

Examples

runai submit-dist pytorch --name distributed-job --workers=2 -g 1 \
     -i <image_name>

Options

Distributed

--clean-pod-policy < string >

The CleanPodPolicy controls deletion of pods when a job terminates. The policy can be one of the following values:

  • Running—only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
  • All—all (including completed) pods will be deleted immediately when the job finishes.
  • None—no pods will be deleted when the job completes.

--max-replicas < int >

Maximum number of replicas for elastic PyTorch job.

--min-replicas < int >

Minimum number of replicas for elastic PyTorch job.
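
Together, the elastic flags might be used as in the following sketch (the replica counts are illustrative):

runai submit-dist pytorch --name distributed-job --min-replicas=2 --max-replicas=4 -g 1 \
     -i <image_name>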

--workers <int>

Number of worker replicas for the distributed training Job.

Naming and Shortcuts

--job-name-prefix <string>

The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

--name <string>

The name of the Job.

--template <string>

Load default values from a workload.

Container Definition

--add-capability <stringArray>

Add linux capabilities to the container.

-a | --annotation <stringArray>

Set annotations variables in the container.

--attach

Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

The --attach flag also sets --tty and --stdin to true.

--command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

--create-home-dir

Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

--image <string> | -i <string>

Image to use when creating the container for this Job

--image-pull-policy <string>

Pulling policy of the image when starting a container. Options are:

  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
  • IfNotPresent: the image is pulled only if it is not already present locally.
  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

For more information see Kubernetes documentation.

-l | --label <stringArray>

Set labels variables in the container.

--preferred-pod-topology-key <string>

If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

--required-pod-topology-key <string>

Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

-t | --tty

Allocate a pseudo-TTY.

--working-dir <string>

Starts the container with the specified directory as the current directory.

Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string>

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)
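
For example, to request one of the listed MIG profiles (a sketch only; whether this can be combined with other GPU flags depends on your cluster setup):

runai submit-dist pytorch --name distributed-job --workers=2 --mig-profile 2g.10gb \
     -i <image_name>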

Job Lifecycle

--backoff-limit <int>

The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

Storage

--git-sync <stringArray>

Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

--large-shm

Mount a large /dev/shm device.

--mount-propagation

Enable HostToContainer mount propagation for all container volumes

--nfs-server <string>

Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

--pvc Pvc_Name:Container_Mount_Path:[ro]

Mount a persistent volume claim into a container.

Note

This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

Container_Mount_Path. A path internal to the container where the storage will be mounted

Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

Examples:

--pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

--pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

--pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

--pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

--pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

--pvc-exists <string>

Mount a persistent volume. You must include a claimname and path.

  • claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

  • path—the path internal to the container where the storage will be mounted

Use the format:

claimname=<CLAIM_NAME>,path=<PATH>

--pvc-new <string>

Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

  • claim name—The name of the persistent volume claim.
  • storage class—A storage class name that can be obtained by running

kubectl get storageclasses.storage.k8s.io.

storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

  • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
  • accessmode—The description of the desired volume capabilities for the PVC.
  • ro—Mount the PVC with read-only access.
  • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=<accessmode>

--s3 <string>

Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazon.com

-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

Volumes to mount into the container.

Examples:

-v /raid/public/john/data:/root/data:ro

Mount /root/data to local path /raid/public/john/data for read-only access.

-v /public/data:/root/data::nfs.example.com

Mount /root/data to NFS path /public/data on NFS server nfs.example.com for read-write access.

Network

--address <string>

Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

--host-ipc

Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

For further information see docker run reference documentation.

--host-network

Use the host's network stack inside the container. For further information see docker run reference documentation.

--port <stringArray>

Expose ports from the Job container.

-s | --service-type <string>

External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

Access Control

--allow-privilege-escalation

Allow the job to gain additional privileges after start.

--run-as-user

Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

Scheduling

--node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

--node-type <string>

Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

--toleration <string>

Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

The format of the string:

operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]

Global Flags

--loglevel (string)

Set the logging level. One of: debug | info | warn | error (default "info")

--project | -p (string)

Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

--help | -h

Show help text.

Output

The command will attempt to submit a distributed PyTorch Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

See Also

  • See the Quickstart document Running Distributed Training.


Last update: 2023-07-16
Created: 2023-03-07

runai submit-dist xgboost

Description

Submit a distributed XGBoost training run:ai job to run.

Syntax notes:

  • Options with a value type of stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

Examples

runai submit-dist xgboost --name distributed-job --workers=2 -g 1 \
     -i <image_name>

Options

Distributed

--clean-pod-policy < string >

The CleanPodPolicy controls deletion of pods when a job terminates. The policy can be one of the following values:

  • Running—only pods still running when a job completes (for example, parameter servers) will be deleted immediately. Completed pods will not be deleted so that the logs will be preserved. (Default)
  • All—all (including completed) pods will be deleted immediately when the job finishes.
  • None—no pods will be deleted when the job completes.

--workers <int>

Number of worker replicas for the distributed training Job, as shown in the sketch below.
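
For instance, a minimal distributed XGBoost submission combining the flags in this section might look like (the image name is a placeholder):

runai submit-dist xgboost --name distributed-job --workers=2 -g 1 \
     -i <image_name> --clean-pod-policy All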

Naming and Shortcuts

--job-name-prefix <string>

The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

--name <string>

The name of the Job.

--template <string>

Load default values from a workload.

Container Definition

--add-capability <stringArray>

Add linux capabilities to the container.

-a | --annotation <stringArray>

Set annotations variables in the container.

--attach

Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

The --attach flag also sets --tty and --stdin to true.

--command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

--create-home-dir

Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

--image <string> | -i <string>

Image to use when creating the container for this Job

--image-pull-policy <string>

Pulling policy of the image when starting a container. Options are:

  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
  • IfNotPresent: the image is pulled only if it is not already present locally.
  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

For more information see Kubernetes documentation.

-l | --label <stringArray>

Set labels variables in the container.

--preferred-pod-topology-key <string>

If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

--required-pod-topology-key <string>

Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

-t | --tty

Allocate a pseudo-TTY.

--working-dir <string>

Starts the container with the specified directory as the current directory.

Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource

Request access to an extended resource. Syntax: <resource-name>=<resource_quantity>.

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string>

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

Job Lifecycle

--backoff-limit <int>

The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

Storage

--git-sync <stringArray>

Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.

--large-shm

Mount a large /dev/shm device.

--mount-propagation

Enable HostToContainer mount propagation for all container volumes

--nfs-server <string>

Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

--pvc Pvc_Name:Container_Mount_Path:[ro]

Mount a persistent volume claim into a container.

Note

This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

Container_Mount_Path. A path internal to the container where the storage will be mounted

Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

Examples:

--pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

--pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

--pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

--pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

--pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

--pvc-exists <string>

Mount a persistent volume. You must include a claimname and path.

  • claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

  • path—the path internal to the container where the storage will be mounted

Use the format:

claimname=<CLAIM_NAME>,path=<PATH>

--pvc-new <string>

Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

  • claim name—The name of the persistent volume claim.
  • storage class—A storage class name that can be obtained by running

kubectl get storageclasses.storage.k8s.io.

storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

  • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
  • accessmode—The description of the desired volume capabilities for the PVC.
  • ro—Mount the PVC with read-only access.
  • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode=<accessmode>

--s3 <string>

Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazon.com

-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

Volumes to mount into the container.

Examples:

-v /raid/public/john/data:/root/data:ro

Mount /root/data to local path /raid/public/john/data for read-only access.

-v /public/data:/root/data::nfs.example.com

Mount /root/data to NFS path /public/data on NFS server nfs.example.com for read-write access.

Network

--address <string>

Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

--host-ipc

Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

For further information see docker run reference documentation.

--host-network

Use the host's network stack inside the container. For further information see docker run reference documentation.

--port <stringArray>

Expose ports from the Job container.

-s | --service-type <string>

External access type to interactive jobs. Options are: portforward, loadbalancer, nodeport, ingress.

Access Control

--allow-privilege-escalation

Allow the job to gain additional privileges after start.

--run-as-user

Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

Scheduling

--node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can satisfy the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group, or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

--node-type <string>

Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

--toleration <string>

Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

The format of the string:

operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]

Global Flags

--loglevel (string)

Set the logging level. One of: debug | info | warn | error (default "info")

--project | -p (string)

Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

--help | -h

Show help text.

Output

The command will attempt to submit a distributed XGBoost Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

See Also


Last update: 2023-07-16
Created: 2023-03-07

runai submit

Description

Submit a Run:ai Job for execution.

Syntax notes:

  • Flags of type stringArray mean that you can add multiple values. You can either separate values with a comma or add the flag twice.

Examples

All examples assume a Run:ai Project has been setup using runai config project <project-name>.

Start an interactive Job:

runai submit -i ubuntu --interactive --attach -g 1
 

Or

runai submit --name build1 -i ubuntu -g 1 --interactive -- sleep infinity 
 

(see: build Quickstart).

Externalize ports:

runai submit --name build-remote -i rastasheep/ubuntu-sshd:14.04 --interactive \
    --service-type=nodeport --port 30022:22
 

Submit a job using the system autogenerated name to an external URL:

runai submit -i ubuntu --interactive --attach -g 1 --service-type=external-url --port 3745 --custom-url=<destination_url>
 

Submit a job without a name to a system-generated URL:

runai submit -i ubuntu --interactive --attach -g 1 --service-type=external-url --port 3745
 

Submit a Job without a name with a pre-defined prefix and an incremental index suffix:

runai submit --job-name-prefix <prefix> -i gcr.io/run-ai-demo/quickstart -g 1 

Options

Job Type

--interactive

Mark this Job as interactive.

--jupyter

Run a Jupyter notebook using a default image and notebook configuration.

Job Lifecycle

--completions < int >

Number of successful pods required for this job to be completed. Used with HPO.

--parallelism < int >

Number of pods to run in parallel at any given time. Used with HPO.

--preemptible

Interactive preemptible jobs can be scheduled above guaranteed quota but may be reclaimed at any time.

--ttl-after-finish < duration >

The duration, after which a finished job is automatically deleted (e.g. 5s, 2m, 3h).

Naming and Shortcuts

--job-name-prefix <string>

The prefix to use to automatically generate a Job name with an incremental index. When a Job name is omitted Run:ai will generate a Job name. The optional --job-name-prefix flag creates Job names with the provided prefix.

--name <string>

The name of the Job.

--template <string>

Load default values from a workload.

Container Definition

--add-capability <stringArray>

Add linux capabilities to the container.

-a | --annotation <stringArray>

Set annotations variables in the container.

--attach

Default is false. If set to true, wait for the Pod to start running. When the pod starts running, attach to the Pod. The flag is equivalent to the command runai attach.

The --attach flag also sets --tty and --stdin to true.

--command

Overrides the image's entry point with the command supplied after '--'. When not using the --command flag, the entry point will not be overridden and the string after -- will be appended as arguments to the entry point command.

Example:

--command -- run.sh 1 54 will start the container and run run.sh 1 54

-- script.py 10000 will append script.py 10000 as arguments to the entry point command (e.g. python)

--create-home-dir

Create a temporary home directory for the user in the container. Data saved in this directory will not be saved when the container exits. For more information see non root containers.

-e <stringArray> | --environment

Define environment variables to be set in the container. To set multiple values add the flag multiple times (-e BATCH_SIZE=50 -e LEARNING_RATE=0.2).

--image <string> | -i <string>

Image to use when creating the container for this Job

--image-pull-policy <string>

Pulling policy of the image when starting a container. Options are:

  • Always (default): force image pulling to check whether local image already exists. If the image already exists locally and has the same digest, then the image will not be downloaded.
  • IfNotPresent: the image is pulled only if it is not already present locally.
  • Never: the image is assumed to exist locally. No attempt is made to pull the image.

For more information see Kubernetes documentation.

-l | --label <stringArray>

Set labels variables in the container.

--preferred-pod-topology-key <string>

If possible, all pods of this job will be scheduled onto nodes that have a label with this key and identical values.

--required-pod-topology-key <string>

Enforce scheduling pods of this job onto nodes that have a label with this key and identical values.

--stdin

Keep stdin open for the container(s) in the pod, even if nothing is attached.

-t | --tty

Allocate a pseudo-TTY.

--working-dir <string>

Starts the container with the specified directory as the current directory.

Resource Allocation

--cpu <double>

CPU units to allocate for the Job (0.5, 1, etc.). The Job will receive at least this amount of CPU. Note that the Job will not be scheduled unless the system can guarantee this amount of CPUs to the Job.

--cpu-limit <double>

Limitations on the number of CPUs consumed by the Job (for example 0.5, 1). The system guarantees that this Job will not be able to consume more than this amount of CPUs.

--extended-resource <stringArray>

Request access to an extended resource. Syntax: <resource_name>=<resource_quantity>.
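
A hedged example (example.vendor.com/dongle is a hypothetical extended resource name advertised by a device plugin in your cluster):

runai submit --name extended-job -i ubuntu --extended-resource example.vendor.com/dongle=1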

-g | --gpu <float>

GPU units to allocate for the Job (0.5, 1).

--gpu-memory

GPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of GPU memory to the Job.

--memory <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The Job will receive at least this amount of memory. Note that the Job will not be scheduled unless the system can guarantee this amount of memory to the Job.

--memory-limit <string>

CPU memory to allocate for this Job (1G, 20M, etc.). The system guarantees that this Job will not be able to consume more than this amount of memory. The Job will receive an error when trying to allocate more memory than this limit.

--mig-profile <string>

MIG profile to allocate for the job (1g.5gb, 2g.10gb, 3g.20gb, 4g.20gb, 7g.40gb)

Job Lifecycle

--backoff-limit <int>

The number of times the Job will be retried before failing. The default is 6. This flag will only work with training workloads (when the --interactive flag is not specified).

Storage

--git-sync <stringArray>

Clone a git repository into the container running the Job. The parameter should follow the syntax: source=REPOSITORY,branch=BRANCH_NAME,rev=REVISION,username=USERNAME,password=PASSWORD,target=TARGET_DIRECTORY_TO_CLONE.
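
A hedged example (the repository URL, branch, and target directory are placeholders; add rev, username, and password as needed, for example for private repositories):

runai submit --name git-job -i python:3.9 -g 1 \
    --git-sync source=https://github.com/<org>/<repo>.git,branch=main,target=/workspace/code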

--large-shm

Mount a large /dev/shm device.

--mount-propagation

Enable HostToContainer mount propagation for all container volumes

--nfs-server <string>

Use this flag to specify a default NFS host for --volume flag. Alternatively, you can specify NFS host for each volume individually (see --volume for details).

--pvc [Storage_Class_Name]:Size:Container_Mount_Path:[ro]

--pvc Pvc_Name:Container_Mount_Path:[ro]

Mount a persistent volume claim into a container.

Note

This option is being deprecated from version 2.10 and above. To mount existing or newly created Persistent Volume Claim (PVC), use the parameters --pvc-exists and --pvc-new.

The 2 syntax types of this command are mutually exclusive. You can either use the first or second form, but not a mixture of both.

Storage_Class_Name is a storage class name that can be obtained by running kubectl get storageclasses.storage.k8s.io. This parameter may be omitted if there is a single storage class in the system, or you are using the default storage class.

Size is the volume size you want to allocate. See Kubernetes documentation for how to specify volume sizes

Container_Mount_Path. A path internal to the container where the storage will be mounted

Pvc_Name. The name of a pre-existing Persistent Volume Claim to mount into the container

Examples:

--pvc :3Gi:/tmp/john:ro - Allocate 3GB from the default Storage class. Mount it to /tmp/john as read-only

--pvc my-storage:3Gi:/tmp/john:ro - Allocate 3GB from the my-storage storage class. Mount it to /tmp/john as read-only

--pvc :3Gi:/tmp/john - Allocate 3GB from the default storage class. Mount it to /tmp/john as read-write

--pvc my-pvc:/tmp/john - Use a Persistent Volume Claim named my-pvc. Mount it to /tmp/john as read-write

--pvc my-pvc-2:/tmp/john:ro - Use a Persistent Volume Claim named my-pvc-2. Mount it to /tmp/john as read-only

--pvc-exists <string>

Mount a persistent volume. You must include a claimname and path.

  • claim name—The name of the persistent volume claim. Can be obtained by running

kubectl get pvc

  • path—the path internal to the container where the storage will be mounted

Use the format:

claimname=<CLAIM_NAME>,path=<PATH>
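
A hedged example using this format (the claim name and path are placeholders):

runai submit --name pvc-job -i ubuntu -g 1 \
    --pvc-exists claimname=my-claim,path=/data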

--pvc-new <string>

Mount a persistent volume claim (PVC). If the PVC does not exist, it will be created based on the parameters entered. If a PVC exists, it will be used with its defined attributes and the parameters in the command will be ignored.

  • claim name—The name of the persistent volume claim.
  • storage class—A storage class name that can be obtained by running

kubectl get storageclasses.storage.k8s.io.

storageclass may be omitted if there is a single storage class in the system, or you are using the default storage class.

  • size—The volume size you want to allocate for the PVC when creating it. See Kubernetes documentation to specify volume sizes.
  • accessmode—The description of the desired volume capabilities for the PVC.
  • ro—Mount the PVC with read-only access.
  • ephemeral—The PVC will be created as volatile temporary storage which is only present during the running lifetime of the job.

Use the format:

storageclass=<storageclass>,size=<size>,path=<path>,ro,accessmode-rwm
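
A hedged example following the format above (the storage class name, size, and path are placeholders; verify the exact set of accepted keys with runai submit --help):

runai submit --name pvc-new-job -i ubuntu -g 1 \
    --pvc-new storageclass=my-storage,size=3Gi,path=/data,ro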

--s3 <string>

Mount an S3 compatible storage into the container running the job. The parameter should follow the syntax:

bucket=BUCKET,key=KEY,secret=SECRET,url=URL,target=TARGET_PATH

All the fields, except url=URL, are mandatory. Default for url is

url=https://s3.amazon.com
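
A hedged example (the bucket name, credentials, and target path are placeholders; url is omitted so the default above is used):

runai submit --name s3-job -i ubuntu -g 1 \
    --s3 bucket=my-bucket,key=<ACCESS_KEY>,secret=<SECRET_KEY>,target=/data/s3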

-v | --volume 'Source:Container_Mount_Path:[ro]:[nfs-host]'

Volumes to mount into the container.

Examples:

-v /raid/public/john/data:/root/data:ro

Mount the local path /raid/public/john/data into the container at /root/data for read-only access.

-v /public/data:/root/data::nfs.example.com

Mount the NFS path /public/data from the NFS server nfs.example.com into the container at /root/data for read-write access.

Network

--address <string>

Comma separated list of IP addresses to listen to when running with --service-type portforward (default: localhost)

--host-ipc

Use the host's ipc namespace. Controls whether the pod containers can share the host IPC namespace. IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores, and message queues. Shared memory segments are used to accelerate inter-process communication at memory speed, rather than through pipes or the network stack.

For further information see docker run reference documentation.

--host-network

Use the host's network stack inside the container. For further information see docker run reference documentation.

--port <stringArray>

Expose ports from the Job container.

-s | --service-type <string>

External access type to interactive jobs. Options are:

  • portforward (deprecated)
  • loadbalancer
  • nodeport
  • external-url

--custom-url <string>

An optional argument that specifies a custom URL when using the external URL service type. If not provided, the system will generate a URL automatically.

Access Control

--allow-privilege-escalation

Allow the job to gain additional privileges after start.

--run-as-user

Run in the context of the current user running the Run:ai command rather than the root user. While the default container user is root (same as in Docker), this command allows you to submit a Job running under your Linux user. This would manifest itself in access to operating system resources, in the owner of new folders created under shared directories, etc. Alternatively, if your cluster is connected to Run:ai via SAML, you can map the container to use the Linux UID/GID which is stored in the organization's directory. For more information see non root containers.

Scheduling

--node-pools <string>

Instructs the scheduler to run this workload using a specific set of nodes which are part of a Node Pool. You can specify one or more node pools to form a prioritized list of node pools that the scheduler will use to find one node pool that can provide the workload's specification. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group or use existing node labels, then create a node-pool and assign the label to the node-pool. This flag can be used in conjunction with node-type and Project-based affinity. In this case, the flag is used to refine the list of allowable node groups set from a node-pool. For more information see: Working with Projects.

--node-type <string>

Allows defining specific Nodes (machines) or a group of Nodes on which the workload will run. To use this feature your Administrator will need to label nodes as explained here: Limit a Workload to a Specific Node Group.

--toleration <string>

Specify one or more toleration criteria, to ensure that the workload is not scheduled onto an inappropriate node. This is done by matching the workload tolerations to the taints defined for each node. For further details see Kubernetes Taints and Tolerations Guide.

The format of the string:

operator=Equal|Exists,key=KEY,[value=VALUE],[effect=NoSchedule|NoExecute|PreferNoSchedule],[seconds=SECONDS]
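
A hedged example (the taint key and value below are hypothetical; use the taints actually defined on your nodes):

runai submit --name tolerant-job -i ubuntu -g 1 \
    --toleration "operator=Equal,key=gpu-type,value=a100,effect=NoSchedule"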

Global Flags

--loglevel (string)

Set the logging level. One of: debug | info | warn | error (default "info")

--project | -p (string)

Specify the Project to which the command applies. Run:ai Projects are used by the scheduler to calculate resource eligibility. By default, commands apply to the default Project. To change the default Project use runai config project <project-name>.

--help | -h

Show help text.

Output

The command will attempt to submit a Job. You can follow up on the Job by running runai list jobs or runai describe job <job-name>.

Note that the submit call may use a policy to provide defaults to any of the above flags.

See Also


Last update: 2023-06-19
Created: 2020-07-19

Adding, Updating and Deleting Users

Introduction

The Run:ai User Interface allows the creation of Run:ai Users. Run:ai Users can receive varying levels of access to the Administration UI and submit Jobs on the Cluster.

Tip

It is possible to connect the Run:ai user interface to the organization's directory and use single sign-on. This allows you to set Run:ai roles for users and groups from the organizational directory. For further information see single sign-on configuration.

Working with Users

You can create users, as well as update and delete users.

Create a User

Note

To be able to review, add, update and delete users, you must have Administrator access. If you do not have such access, please contact an Administrator.

Department Admin is available in version 2.10 and later.

  1. Log in to the Users area of the Run:ai User interface at company-name.run.ai.
  2. Select the Users tab for local users, or the SSO Users tab for SSO users.
  3. On the top right, select "NEW USER".
  4. Enter the user's email.
  5. Select Roles. More than one role can be selected. Available roles are:

    • Administrator—Can manage Users and install Clusters.
    • Editor—Can manage Projects and Departments.
    • Viewer—View-only access to the Run:ai User Interface.
    • Researcher—Can submit ML workloads. Setting a user as a Researcher also requires assigning the user to projects.
    • Research Manager—Can act as Researcher in all projects, including new ones to be created in the future.
    • ML Engineer—Can view and manage deployments and cluster resources. Available only when Inference module is installed.
    • Department Administrator—Can manage Departments, descendent Projects and Workloads.

    For more information, see Roles and permissions.

  6. (Optional) Select Cluster(s). This determines what Clusters are accessible to this User.

  7. Press "Save".

You will get the new user credentials and have the option to send the credentials by email.

Roles and permissions

Roles provide a way to group permissions and assign them to either users or user groups. The role identifies the collection of permissions that administrators assign to users or user groups. Permissions define the actions that users can perform on the managed entities. The following table shows the default roles and permissions.

Managed Entity / Roles | Admin | Dep. Admin | Editor | Research Manager | Researcher | ML Eng. | Viewer
Assign (Settings) Users/Groups/Apps to Roles | CRUD (all roles) | CRUD (Proj. Researchers and ML Engineers only) | N/A | N/A | N/A | N/A | N/A
Assign Users/Groups/Apps to Organizations | R (Projects, Departments) | CRUD (Projects only) | CRUD (Projects, Departments) | N/A | N/A | N/A | N/A
Departments | R | R | CRUD | N/A | N/A | R | R
Projects | R | CRUD | CRUD | R | R | R | R
Jobs | R | R | R | R | CRUD | N/A | R
Deployments | R | R | R | N/A | N/A | CRUD | R
Workspaces | R | R | R | R | CRUD | N/A | N/A
Environments | CRUD | CRUD | CRUD | CRUD | CRUD | N/A | N/A
Data Sources | CRUD | CRUD | CRUD | CRUD | CRUD | N/A | N/A
Compute Resources | CRUD | CRUD | CRUD | CRUD | CRUD | N/A | N/A
Templates | CRUD | CRUD | CRUD | CRUD | CRUD | N/A | N/A
Clusters | CRUD | N/A | R | N/A | N/A | R | R
Node Pools | CRUD | N/A | R | N/A | N/A | R | R
Nodes | R | N/A | R | N/A | N/A | R | R
Settings (General, Credentials) | CRUD | N/A | N/A | N/A | N/A | N/A | N/A
Events History | R | N/A | N/A | N/A | N/A | N/A | N/A
Dashboard.Overview | R | R | R | R | R | R | R
Dashboards.Analytics | R | R | R | R | R | R | R
Dashboards.Consumption | R | N/A | N/A | N/A | N/A | N/A | N/A

Permissions: C = Create, R = Read, U = Update, D = Delete


Last update: 2023-05-23
Created: 2020-07-16

Prerequisites

Below are the prerequisites of a cluster installed with Run:ai.

Prerequisites in a Nutshell

The following is a checklist of the Run:ai prerequisites:

Prerequisite | Details
Kubernetes | Verify certified vendor and correct version.
NVIDIA GPU Operator | Different Kubernetes flavors have slightly different setup instructions. Verify correct version.
Ingress Controller | Install and configure NGINX (some Kubernetes flavors have NGINX pre-installed).
Prometheus | Install Prometheus.
Trusted domain name | You must provide a trusted domain name. Accessible only inside the organization.
(Optional) Distributed Training | Install Kubeflow Training Operator if required.
(Optional) Inference | Some third party software needs to be installed to use the Run:ai inference module.

There are also specific hardware, operating system and network access requirements. A pre-install script is available to test if the prerequisites are met before installation.

Software Requirements

Operating System

  • Run:ai will work on any Linux operating system that is supported by both Kubernetes and NVIDIA.
  • An important highlight is that GKE (Google Kubernetes Engine) will only work with Ubuntu, as NVIDIA does not support the default Container-Optimized OS with Containerd image.
  • Run:ai performs its internal tests on Ubuntu 20.04 and CoreOS for OpenShift.

Kubernetes

Run:ai requires Kubernetes. Run:ai has been certified with the following Kubernetes distributions:

Kubernetes Distribution | Description | Installation Notes
Vanilla Kubernetes | Using no specific distribution but rather k8s native installation | See instructions for a simple (non-production-ready) Kubernetes Installation script.
OCP | OpenShift Container Platform | The Run:ai operator is certified for OpenShift by Red Hat.
EKS | Amazon Elastic Kubernetes Service |
AKS | Azure Kubernetes Services |
GKE | Google Kubernetes Engine |
RKE | Rancher Kubernetes Engine | When installing Run:ai, select On Premise. RKE2 has a defect which requires a specific installation flow. Please contact Run:ai customer support for additional details.
Bright | NVIDIA Bright Cluster Manager | In addition, NVIDIA DGX comes bundled with Run:ai

Run:ai has been tested with the following Kubernetes distributions. Please contact Run:ai Customer Support for up to date certification details:

Kubernetes Distribution | Description | Installation Notes
Ezmeral | HPE Ezmeral Container Platform | See Run:ai at Ezmeral marketplace
Tanzu | VMWare Kubernetes | Tanzu supports containerd rather than docker. See the NVIDIA prerequisites below as well as cluster customization for changes required for containerd

Following is a Kubernetes support matrix for the latest Run:ai releases:

Run:ai version | Supported Kubernetes versions | Supported OpenShift versions
Run:ai 2.9 | 1.21 through 1.26 | 4.8 through 4.11
Run:ai 2.10 | 1.21 through 1.26 (see note below) | 4.8 through 4.11
Run:ai 2.12 | 1.23 through 1.27 (see note below) | 4.10 through 4.12
Run:ai 2.13 | 1.23 through 1.27 (see note below) | 4.10 through 4.12

Note

Run:ai allows scheduling of Jobs with PVCs. See for example the command-line interface flag --pvc-new. A Job scheduled with a PVC based on a specific type of storage class (a storage class with the property volumeBindingMode equals to WaitForFirstConsumer) will not work on Kubernetes 1.23 or lower.

For an up-to-date end-of-life statement of Kubernetes see Kubernetes Release History.

Run:ai does not support Pod Security Admission. Support for Pod Security Policy has been removed with Run:ai 2.9.

NVIDIA

Run:ai has been certified on NVIDIA GPU Operator 22.9 to 23.3. Older versions (1.10 and 1.11) have a documented NVIDIA issue.

Follow the Getting Started guide to install the NVIDIA GPU Operator, or see the distribution-specific instructions below:

  • When setting up EKS, do not install the NVIDIA device plug-in (as we want the NVIDIA GPU Operator to install it instead). When using the eksctl tool to create an AWS EKS cluster, use the flag --install-nvidia-plugin=false to disable this install.
  • Follow the Getting Started guide to install the NVIDIA GPU Operator. For GPU nodes, EKS uses an AMI which already contains the NVIDIA drivers. As such, you must use the GPU Operator flags: --set driver.enabled=false.
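
As a hedged sketch of the install referenced above (the Helm repository URL and chart name are taken from NVIDIA's Getting Started guide; verify them against the current guide), an EKS installation that keeps the AMI's pre-installed drivers could look like:

helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
helm install gpu-operator nvidia/gpu-operator \
    -n gpu-operator --create-namespace \
    --set driver.enabled=false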

Create the gpu-operator namespace by running

kubectl create ns gpu-operator
 

Before installing the GPU Operator you must create the following file:

resourcequota.yaml
apiVersion: v1
kind: ResourceQuota
metadata:
  ...
      values:
      - system-node-critical
      - system-cluster-critical

Then run: kubectl apply -f resourcequota.yaml

Important

  • Run:ai on GKE has only been tested with GPU Operator version 22.9 and up.
  • The above only works for Run:ai 2.7.16 and above.

Install the NVIDIA GPU Operator as discussed here.

Notes

  • Use the default namespace gpu-operator. Otherwise, you must specify the target namespace using the flag runai-operator.config.nvidiaDcgmExporter.namespace as described in customized cluster installation.
  • NVIDIA drivers may already be installed on the nodes. In such cases, use the NVIDIA GPU Operator flags --set driver.enabled=false. DGX OS is one such example as it comes bundled with NVIDIA Drivers.
  • To use Dynamic MIG, the GPU Operator must be installed with the flag mig.strategy=mixed. If the GPU Operator is already installed, edit the clusterPolicy by running kubectl patch clusterPolicy cluster-policy -n gpu-operator --type=merge -p '{"spec":{"mig":{"strategy": "mixed"}}}'

Ingress Controller

Run:ai requires an ingress controller as a prerequisite. The Run:ai cluster installation configures one or more ingress objects on top of the controller.

There are many ways to install and configure an ingress controller and configuration is environment-dependent. A simple solution is to install & configure NGINX:

helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo update
helm upgrade -i nginx-ingress ingress-nginx/ingress-nginx \
    --namespace nginx-ingress --create-namespace

kubectl create secret tls runai-cluster-domain-tls-secret -n runai \
    --cert /path/to/fullchain.pem \ # (1)
    --key /path/to/private.pem # (2)
  1. The domain's cert (public key).
  2. The domain's private key.

For more information on how to create a TLS secret see: https://kubernetes.io/docs/concepts/configuration/secret/#tls-secrets.

Note

In a self-hosted installation, the typical scenario is to install the first Run:ai cluster on the same Kubernetes cluster as the control plane. In this case, the cluster URL need not be provided as it will be the same as the control-plane URL.

Prometheus

If not already installed on your cluster, install the full kube-prometheus-stack through the Prometheus community Operator.

Note

  • If Prometheus has been installed on the cluster in the past, even if it was uninstalled (such as when upgrading from Run:ai 2.8 or lower), you will need to update Prometheus CRDs as described here. For more information on the Prometheus bug see here.
  • If you are running Kubernetes 1.21, you must install a Prometheus stack version of 45.23.0 or lower. Use the --version flag below. Alternatively, use Helm version 3.12 or later. For more information on the related Prometheus bug see here

Then install the Prometheus stack by running:

helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
 helm repo update
 helm install prometheus prometheus-community/kube-prometheus-stack \
     -n monitoring --create-namespace --set grafana.enabled=false # (1)
  1. The Grafana component is not required for Run:ai.
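
If you are on Kubernetes 1.21 and pinning the chart version as described in the note above, a hedged variant of the same install command would be:

helm install prometheus prometheus-community/kube-prometheus-stack \
    --version 45.23.0 -n monitoring --create-namespace --set grafana.enabled=false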

Optional Software Requirements

The following software enables specific features of Run:ai

Distributed Training

Run:ai supports three different methods of running distributed training jobs across multiple nodes:

  • MPI
  • TensorFlow
  • PyTorch

To install all three, run the following:

kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0"
 

Inference

To use the Run:ai inference module you must pre-install Knative Serving. Follow the instructions here to install. Run:ai is certified on Knative 1.4 to 1.8 with Kubernetes 1.22 or later.

Post-install, you must configure Knative to use the Run:ai scheduler and allow pod affinity, by running:

kubectl patch configmap/config-features \
   --namespace knative-serving \
   --type merge \
   --patch '{"data":{"kubernetes.podspec-schedulername":"enabled","kubernetes.podspec-affinity":"enabled"}}'
 

Inference Autoscaling

Run:ai allows you to autoscale a deployment according to various metrics:

  1. GPU Utilization (%)
  2. CPU Utilization (%)
  3. Latency (milliseconds)
  4. Throughput (requests/second)
  5. Concurrency
  6. Any custom metric

Additional installation may be needed for some of the metrics as follows:

  • Using Throughput or Concurrency does not require any additional installation.
  • Any other metric will require installing the HPA Autoscaler.
  • Using GPU Utilization, Latency or Custom metric will also require the Prometheus adapter. The Prometheus adapter is part of the Run:ai installer and can be added by setting the prometheus-adapter.enabled flag to true. See Customizing the Run:ai installation for further information.
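
A hedged sketch of passing this flag when installing or upgrading the Run:ai cluster with Helm (the release name, chart reference, and namespace below are placeholders; use the values from your own installation command, or set the flag in your values file instead):

helm upgrade -i runai-cluster <runai-cluster-chart> -n runai \
    --set prometheus-adapter.enabled=true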

If you wish to use an existing Prometheus adapter installation, you will need to configure it manually with the Run:ai Prometheus rules, specified in the Run:ai chart values under the prometheus-adapter.rules field. For further information, please contact Run:ai customer support.

Accessing Inference from outside the Cluster

Inference workloads will typically be accessed by consumers residing outside the cluster. You will hence want to provide consumers with a URL to access the workload. The URL can be found in the Run:ai user interface under the deployment screen (alternatively, run kubectl get ksvc -n <project-namespace>).

However, for the URL to be accessible outside the cluster you must configure your DNS as described here.

Alternative Configuration

When the above DNS configuration is not possible, you can manually add the Host header to the REST request as follows:

  • Get an <external-ip> by running kubectl get service -n kourier-system kourier. If you have been using istio during Run:ai installation, run: kubectl -n istio-system get service istio-ingressgateway instead.
  • Send a request to your workload by using the external ip, and place the workload url as a Host header. For example
curl http://<external-ip>/<container-specific-path>
     -H 'Host: <host-name>'

Hardware Requirements

(see picture below)

  • (Production only) Run:ai System Nodes: To reduce downtime and save CPU cycles on expensive GPU Machines, we recommend that production deployments contain two or more worker machines designated for Run:ai software. The nodes do not have to be dedicated to Run:ai, but for Run:ai purposes we would need:

    • 8 CPUs
    • 16GB of RAM
    • 50GB of Disk space
  • Shared data volume: Run:ai uses Kubernetes to abstract away the machine on which a container is running:

    • Researcher containers: The Researcher's containers need to be able to access data from any machine in a uniform way, to access training data and code as well as save checkpoints, weights, and other machine-learning-related artifacts.
    • The Run:ai system needs to save data on a storage device that is not dependent on a specific node.

    Typically, this is achieved via Kubernetes Storage class based on Network File Storage (NFS) or Network-attached storage (NAS).

  • Docker Registry: With Run:ai, Workloads are based on Docker images. For container images to run on any machine, these images must be downloaded from a docker registry rather than reside on the local machine (though this is also possible). You can use a public registry such as Docker Hub or set up a local registry on-prem (preferably on a dedicated machine). Run:ai can assist with setting up the repository.

  • Kubernetes: Production Kubernetes installation requires separate nodes for the Kubernetes master. For more details see your specific Kubernetes distribution documentation.

img/prerequisites.png

User requirements

Usage of containers and images: The individual Researcher's work must be based on container images.

Network Access Requirements

Internal networking: Kubernetes networking is an add-on rather than a core part of Kubernetes. Different add-ons have different network requirements. You should consult the documentation of the specific add-on on which ports to open. It is however important to note that unless special provisions are made, Kubernetes assumes all cluster nodes can interconnect using all ports.

Outbound network: Run:ai user interface runs from the cloud. All container nodes must be able to connect to the Run:ai cloud. Inbound connectivity (connecting from the cloud into nodes) is not required. If outbound connectivity is limited, the following exceptions should be applied:

During Installation

Run:ai requires an installation over the Kubernetes cluster. The installation accesses the web to download various images from registries. Some organizations place limitations on what you can pull from the internet. The following list shows the various solution components and their origin:

Name | Description | URLs | Ports
Run:ai Repository | Run:ai Helm Package Repository | runai-charts.storage.googleapis.com | 443
Docker Images Repository | Run:ai images | gcr.io/run-ai-prod | 443
Docker Images Repository | Third party Images | hub.docker.com and quay.io | 443
Run:ai | Run:ai Cloud instance | app.run.ai | 443, 53

Post Installation

In addition, once running, Run:ai requires an outbound network connection to the following targets:

Name | Description | URLs | Ports
Grafana | Grafana Metrics Server | prometheus-us-central1.grafana.net and runailabs.com | 443
Run:ai | Run:ai Cloud instance | app.run.ai | 443, 53

Network Proxy

If you are using a proxy for outbound communication, please contact Run:ai customer support.

Pre-install Script

Once you believe that the Run:ai prerequisites are met, we highly recommend installing and running the Run:ai pre-install diagnostics script. The tool:

  • Tests the below requirements as well as additional failure points related to Kubernetes, NVIDIA, storage, and networking.
  • Looks at additional components installed and analyzes their relevance to a successful Run:ai installation.

To use the script, download the latest version and run:

chmod +x preinstall-diagnostics-<platform>
 ./preinstall-diagnostics-<platform>

If the script fails, or if the script succeeds but the Kubernetes system contains components other than Run:ai, locate the file runai-preinstall-diagnostics.txt in the current directory and send it to Run:ai technical support.

For more information on the script including additional command-line flags, see here.


Last update: 2023-07-25
Created: 2020-07-19


Run:ai version 2.13

Version 2.13.7

Release date

July 2023

Release content

  • Added filters to the historic quota ratio widget on the Quota management dashboard.

Fixed issues

Internal ID Description
RUN-11080 Fixed an issue in OpenShift environments where logging in via SSO with the kubeadmin user gets blank pages for every page.
RUN-11119 Fixed an issue where values that should be in the Order of priority column are in the wrong column.
RUN-11120 Fixed an issue where the Projects table does not show correct metrics when Run:ai version 2.13 is paired with a Run:ai 2.8 cluster.
RUN-11121 Fixed an issue where the wrong over quota memory alert is shown in the Quota management pane in project edit form.
RUN-11272 Fixed an issue in OpenShift environments where the selection in the cluster drop down in the main UI does not match the cluster selected on the login page.
Version 2.13.4

Release date

July 2023

Fixed issues

Internal ID Description
RUN-11089 Fixed an issue where, when creating an environment, commands in the Runtime settings pane are not persistent and cannot be found in other assets (for example, in a new Training).

Version 2.13.1

Release date

July 2023

Release content

  • Made an improvement so that occurrences of labels that are no longer in use are deleted.

Fixed issues

N/A

Version 2.13.0

Release content

This version contains features and fixes from previous versions starting with 2.9. For information about features, functionality, and fixed issues in previous versions, see the release notes for those versions.

Projects

  • Improved the Projects UI for ease of use. Projects now follows UI upgrades and changes designed to make setting up components and assets easier for administrators and researchers. To configure a project, see Projects.

Dashboards

  • Added a new dashboard for Quota management, which provides an efficient means to monitor and manage resource utilization within the AI cluster. The dashboard filters the display of resource quotas based on Departments, Projects, and Node pools. For more information, see Quota management dashboard.

  • Added to the Overview dashboard, the ability to filter the cluster by one or more node pools. For more information, see Node pools.

Nodes and Node pools

  • The Run:ai scheduler supports two scheduling strategies: Bin Packing (default) and Spread. For more information, see Scheduling strategies. You can configure the scheduling strategy at the node pool level to improve support for clusters with mixed types of resources and workloads. For configuration information, see Creating new node pools.

  • GPU device-level DCGM metrics are collected per GPU and presented by Run:ai in the Nodes table. Each node contains a list of its embedded GPUs with their respective DCGM metrics. See DCGM Metrics for the list of metrics provided by NVIDIA DCGM and collected by Run:ai. Contact your Run:ai customer representative to enable this feature.

  • Added per node pool over-quota priority. Over-quota priority sets the relative amount of additional unused resources that an asset can get above its current quota. For more information, see Over-quota priority.
  • Added support for associating workspaces with node pools. The association between workspaces and node pools is done using the Compute resources section. To associate a compute resource with a node pool, in the Compute resource section, press More settings. Press Add new to add more node pools to the configuration. Drag and drop the node pools to set their priority.
  • Added Node pool selection to the workload submission form. This allows researchers to quickly see the available node pools and set their priority by dragging and dropping them in the desired order. In addition, when the node pool priority list is locked by a policy, the list is not editable by the Researcher, even if the workspace is created from a template or copied from another workspace.

Time limit duration

  • Improved the behavior of any workload time limit (for example, Idle time limit) so that the time limit now affects existing workloads that were created before the time limit was configured. This optional feature helps handle situations where researchers leave sessions open even when they no longer need the resources. For more information, see Limit duration of interactive training jobs.

  • Improved workspace time limits. Workspaces that reach a time limit now transition to a Stopped state so that they can be reactivated later.

  • Added time limits for training jobs per project. Administrators (Department Admin, Editor) can limit the duration of Run:ai training jobs per project using a specified time limit value. This capability helps administrators limit the duration and resources consumed over time by training jobs in specific projects. Each training job that reaches this duration is terminated.

Workload assets

  • Extended the collaboration functionality for workload assets such as Environment, Compute resource, and some Data source types. These assets can now be shared with Departments in the organization, in addition to being shared with specific projects or the entire cluster.
  • Added a search box for card galleries in any asset-based workload creation form to provide an easy way to search for assets and resources. To filter, use the asset name or one of the field values of the card.

PVC data sources

  • Added support for PVC block storage in the New data source form. In the New data source form for a new PVC data source, in the Volume mode field, select either Filesystem or Block. For more information, see Create a PVC data source.
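In Kubernetes terms, a block-mode PVC is simply a claim whose volumeMode is set to Block instead of the default Filesystem. The generic sketch below is for orientation only; the claim name, namespace, storage class, and size are illustrative placeholders, not values tied to a specific Run:ai data source.

    # Generic Kubernetes PVC requesting raw block storage (illustrative values only).
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: block-data-pvc          # hypothetical claim name
      namespace: runai-my-project   # hypothetical project namespace
    spec:
      accessModes:
        - ReadWriteOnce
      volumeMode: Block             # "Block" instead of the default "Filesystem"
      storageClassName: standard    # assumed storage class
      resources:
        requests:
          storage: 50Gi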

Credentials

  • Added Docker registry to the Credentials menu. Users can create Docker registry credentials for use in specific projects for image pulling. To configure credentials, see Configuring credentials.
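Registry credentials of this kind are commonly backed by a Kubernetes docker-registry secret. The generic kubectl sketch below is for illustration only and is not the Run:ai UI flow described above; the secret name, namespace, and registry details are placeholders.

    # Generic Kubernetes docker-registry secret (placeholder values).
    kubectl create secret docker-registry my-registry-cred \
        --namespace runai-my-project \
        --docker-server=registry.example.com \
        --docker-username=<username> \
        --docker-password=<password>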

Policies

  • Improved policy support by adding a DEFAULTS section to the items section of the policy. The DEFAULTS section sets the default behavior for items declared in this section. For example, it can be used to limit the submission of workloads to existing PVCs only (see the sketch after this list). For more information and an example, see Policies, Complex values.
  • Added support for making a PVC data source available to all projects. In the New data source form, when creating a new PVC data source, select All from the Project pane.
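The authoritative policy schema is documented in Policies, Complex values; the fragment below is only a hypothetical sketch of how a DEFAULTS entry for PVC items might look, and every key name and value in it is an assumption rather than the documented format.

    # Hypothetical policy fragment -- key names are assumptions; see Policies, Complex values.
    pvcs:
      items:
        DEFAULTS:
          existingPvc:
            value: true      # default each PVC item to an existing PVC
            canEdit: false   # researchers cannot override, so only existing PVCs can be submitted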

Researcher API

Integrations

  • Added support for Ray jobs. Ray is an open-source unified framework for scaling AI and Python applications. For more information, see Integrate Run:ai with Ray.

  • Added integration with Weights & Biases Sweep to allow data scientists to submit hyperparameter optimization workloads directly from the Run:ai UI (a generic sweep configuration sketch appears after this list). To configure a sweep, see Sweep configuration.

  • Added support for XGBoost. XGBoost, which stands for Extreme Gradient Boosting, is a scalable, distributed gradient-boosted decision tree (GBDT) machine learning library. It provides parallel tree boosting and is the leading machine learning library for regression, classification, and ranking problems. For more information, see runai submit-dist xgboost.
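For orientation, a generic Weights & Biases sweep configuration is shown below; the program name, metric, and parameter ranges are illustrative, and the Run:ai-specific submission steps are those described in Sweep configuration.

    # Generic W&B sweep configuration (illustrative program, metric, and parameters).
    program: train.py
    method: bayes
    metric:
      name: val_loss
      goal: minimize
    parameters:
      learning_rate:
        min: 0.0001
        max: 0.1
      batch_size:
        values: [32, 64, 128]

Following the pattern of the other runai submit-dist commands, a distributed XGBoost submission might look like the sketch below; the job name, worker count, and image are placeholders, and runai submit-dist xgboost remains the authoritative reference for the supported flags.

    # Illustrative submission of a distributed XGBoost job (placeholder values).
    runai submit-dist xgboost --name xgboost-dist-job --workers=2 -g 1 \
        -i <image_name>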

Compatibility

Installation

  • The manual process of upgrading Kubernetes CRDs is no longer needed when upgrading to the most recent version (2.13) of Run:ai.
  • From Run:ai 2.12 and above, the control-plane installation has been simplified and no longer requires the creation of a backend values file. Instead, install directly using helm as described in Install the Run:ai Control Plane (an illustrative sketch follows this list).
  • From Run:ai 2.12 and above, the air-gapped, control-plane installation now generates a custom-env.yaml values file during the preparation stage. This is used when installing the control-plane.
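For orientation only, a minimal sketch of what a helm-based control-plane installation can look like. The repository URL, chart name, release name, and values below are placeholders and assumptions, not the documented procedure; follow Install the Run:ai Control Plane for the actual commands.

    # Placeholder repository URL and chart name -- not the documented values.
    helm repo add runai-backend <runai-backend-helm-repo-url>
    helm repo update

    # Connected installation: no backend values file is needed.
    helm upgrade -i runai-backend runai-backend/<control-plane-chart> \
        -n runai-backend --create-namespace \
        --set global.domain=<control-plane-fqdn>

    # Air-gapped installation: pass the custom-env.yaml generated during preparation.
    helm upgrade -i runai-backend runai-backend/<control-plane-chart> \
        -n runai-backend --create-namespace \
        -f custom-env.yaml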

Known issues

Internal ID Description
RUN-11005 Incorrect error messages when trying to run runai CLI commands in an OpenShift environment.
RUN-11009 Incorrect error message when a user without permissions tries to delete another user.

Fixed issues

Internal ID Description
RUN-9039 Fixed an issue in the New job screen where, after toggling off the preemptible flag and submitting the job, the job was still shown as preemptible.
RUN-9323 Fixed an issue where a non-scalable error message was shown when scheduling hundreds of nodes was unsuccessful.
RUN-9324 Fixed an issue where the scheduler did not take the amount of storage into consideration, so there was no explanation that the PVC was not ready.
RUN-9902 Fixed an issue in OpenShift environments where there were no metrics in the dashboard because Prometheus did not have permissions to monitor the runai namespace after an installation or upgrade to version 2.9.
RUN-9920 Fixed an issue where the canEdit key in a policy was not validated properly for itemized fields when configuring an interactive policy.
RUN-10052 Fixed an issue where loading a new job from a template produced an error until changes were made on the form.
RUN-10053 Fixed an issue where the Node pool column was unsearchable in the job list.
RUN-10422 Fixed an issue where node details showed running workloads that had actually finished (successfully, failed, and so on).
RUN-10500 Fixed an issue where jobs were shown as running even though they no longer existed in the cluster.
RUN-10813 Fixed an issue when adding a data source where the path was case sensitive and did not allow uppercase characters.
