diff --git a/.github/workflows/github_page.yaml b/.github/workflows/github_page.yaml deleted file mode 100644 index 483c2adc..00000000 --- a/.github/workflows/github_page.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: GitHub Pages - -on: - workflow_dispatch: - -jobs: - deploy: - runs-on: ubuntu-22.04 - permissions: - contents: write - concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - steps: - - uses: actions/checkout@v3 - with: - ref: gallery - - - name: Setup Node - uses: actions/setup-node@v4 - with: - node-version: 20 - - - run: npm install - - run: npm run build - - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./build diff --git a/.github/workflows/pages-deploy.yml b/.github/workflows/pages-deploy.yml new file mode 100644 index 00000000..0f65155d --- /dev/null +++ b/.github/workflows/pages-deploy.yml @@ -0,0 +1,73 @@ +name: "Deploy Tutorial" +on: + push: + branches: + - main + paths-ignore: + - .gitignore + - README.md + - LICENSE + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +# Allow one concurrent deployment +concurrency: + group: "pages" + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Pages + id: pages + uses: actions/configure-pages@v4 + + - name: Setup Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: "3.3" + bundler-cache: true + # The Gemfile lives in docs/tutorial, so gems must be resolved and cached from there + working-directory: ./docs/tutorial + + - name: Install Dependencies + run: | + cd ./docs/tutorial + bundle install + + - name: Build site + run: | + cd ./docs/tutorial + bundle exec jekyll b -d "_site" + env: + JEKYLL_ENV: "production" + + - name: Upload site artifact + uses: actions/upload-pages-artifact@v3 + with: + path: "./docs/tutorial/_site" + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + 
runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 + diff --git a/.gitignore b/.gitignore index 04b419aa..d8baefd2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ __pycache__/ # Distribution / packaging .Python build/ +_site develop-eggs/ dist/ downloads/ diff --git a/docs/tutorial/.nojekyll b/docs/tutorial/.nojekyll new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/docs/tutorial/.nojekyll @@ -0,0 +1 @@ + diff --git a/docs/tutorial/Gemfile b/docs/tutorial/Gemfile new file mode 100644 index 00000000..dbd38721 --- /dev/null +++ b/docs/tutorial/Gemfile @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +source "https://rubygems.org" + +gem "jekyll-theme-chirpy", "~> 7.1", ">= 7.1.1" + +gem "html-proofer", "~> 5.0", group: :test + +platforms :mingw, :x64_mingw, :mswin, :jruby do + gem "tzinfo", ">= 1", "< 3" + gem "tzinfo-data" +end + +gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] diff --git a/docs/tutorial/_config.yml b/docs/tutorial/_config.yml new file mode 100644 index 00000000..2e87c334 --- /dev/null +++ b/docs/tutorial/_config.yml @@ -0,0 +1,226 @@ +# The Site Configuration + +# Import the theme +theme: jekyll-theme-chirpy + +# The language of the webpage › http://www.lingoes.net/en/translator/langcode.htm +# If it has the same name as one of the files in folder `_data/locales`, the layout language will also be changed, +# otherwise, the layout language will use the default value of 'en'. +lang: en-customized + +# Change to your timezone › https://kevinnovak.github.io/Time-Zone-Picker +timezone: + +# jekyll-seo-tag settings › https://github.com/jekyll/jekyll-seo-tag/blob/master/docs/usage.md +# ↓ -------------------------- + +title: Open-Sora Tutorial # main title + +tagline: Let's train a T2V model from scratch. # subtitle + +description: >- # used by seo meta and the atom feed + Open-Sora Tutorial: Let's train a T2V model from scratch. 
+ +# Fill in the protocol & hostname for your site. +# E.g. 'https://username.github.io', note that it does not end with a '/'. +url: "https://LambdaLabsML.github.io" + +github: + username: LambdaLabsML # change to your GitHub username + +#twitter: +# username: twitter_username # change to your Twitter username + +social: + # Change to your full name. + # It will be displayed as the default author of the posts and the copyright owner in the Footer + name: ML Team @ LambdaLabsML + email: example@domain.com # change to your email address + links: + # The first element serves as the copyright owner's link + - https://github.com/LambdaLabsML # change to your GitHub homepage + #- https://twitter.com/username # change to your Twitter homepage + # Uncomment below to add more social links + # - https://www.facebook.com/username + # - https://www.linkedin.com/in/username + +# Site Verification Settings +webmaster_verifications: + google: # fill in your Google verification code + bing: # fill in your Bing verification code + alexa: # fill in your Alexa verification code + yandex: # fill in your Yandex verification code + baidu: # fill in your Baidu verification code + facebook: # fill in your Facebook verification code + +# ↑ -------------------------- +# The end of `jekyll-seo-tag` settings + +# Web Analytics Settings +analytics: + google: + id: # fill in your Google Analytics ID + goatcounter: + id: # fill in your GoatCounter ID + umami: + id: # fill in your Umami ID + domain: # fill in your Umami domain + matomo: + id: # fill in your Matomo ID + domain: # fill in your Matomo domain + cloudflare: + id: # fill in your Cloudflare Web Analytics token + fathom: + id: # fill in your Fathom Site ID + +# Page views settings +pageviews: + provider: # now only supports 'goatcounter' + +# Prefer color scheme setting. 
+# +# Note: Keep empty will follow the system prefer color by default, +# and there will be a toggle to switch the theme between dark and light +# on the bottom left of the sidebar. +# +# Available options: +# +# light — Use the light color scheme +# dark — Use the dark color scheme +# +theme_mode: # [light | dark] + +# The CDN endpoint for media resources. +# Notice that once it is assigned, the CDN url +# will be added to all media resources (site avatar, posts' images, audio and video files) paths starting with '/' +# +# e.g. 'https://cdn.com' +cdn: + +# the avatar on sidebar, support local or CORS resources +avatar: + +# The URL of the site-wide social preview image used in SEO `og:image` meta tag. +# It can be overridden by a customized `page.image` in front matter. +social_preview_image: # string, local or CORS resources + +# boolean type, the global switch for TOC in posts. +toc: true + +comments: + # Global switch for the post-comment system. Keeping it empty means disabled. + provider: # [disqus | utterances | giscus] + # The provider options are as follows: + disqus: + shortname: # fill with the Disqus shortname. 
› https://help.disqus.com/en/articles/1717111-what-s-a-shortname + # utterances settings › https://utteranc.es/ + utterances: + repo: # / + issue_term: # < url | pathname | title | ...> + # Giscus options › https://giscus.app + giscus: + repo: # / + repo_id: + category: + category_id: + mapping: # optional, default to 'pathname' + strict: # optional, default to '0' + input_position: # optional, default to 'bottom' + lang: # optional, default to the value of `site.lang` + reactions_enabled: # optional, default to the value of `1` + +# Self-hosted static assets, optional › https://github.com/cotes2020/chirpy-static-assets +assets: + self_host: + enabled: # boolean, keep empty means false + # specify the Jekyll environment, empty means both + # only works if `assets.self_host.enabled` is 'true' + env: # [development | production] + +pwa: + enabled: true # The option for PWA feature (installable) + cache: + enabled: true # The option for PWA offline cache + # Paths defined here will be excluded from the PWA cache. + # Usually its value is the `baseurl` of another website that + # shares the same domain name as the current website. + deny_paths: + # - "/example" # URLs match `/example/*` will not be cached by the PWA + +#paginate: 10 + +# The base URL of your site +baseurl: "/Open-Sora" + +# ------------ The following options are not recommended to be modified ------------------ + +kramdown: + footnote_backlink: "↩︎" + syntax_highlighter: rouge + syntax_highlighter_opts: # Rouge Options › https://github.com/jneen/rouge#full-options + css_class: highlight + # default_lang: console + span: + line_numbers: false + block: + line_numbers: true + start_line: 1 + +collections: + tabs: + output: true + sort_by: order + +defaults: + - scope: + path: "" # An empty string here means all files in the project + type: posts + values: + layout: post + comments: true # Enable comments in posts. + toc: true # Display TOC column in posts. 
+ # DO NOT modify the following parameter unless you are confident enough + # to update the code of all other post links in this project. + permalink: /posts/:title/ + - scope: + path: _drafts + values: + comments: false + - scope: + path: "" + type: tabs # see `site.collections` + values: + layout: page + permalink: /:title/ + +sass: + style: compressed + sass_dir: _sass + +compress_html: + clippings: all + comments: all + endings: all + profile: false + blanklines: false + ignore: + envs: [development] + +exclude: + - "*.gem" + - "*.gemspec" + - docs + - tools + - README.md + - LICENSE + - "*.config.js" + - package*.json + +#jekyll-archives: +# enabled: [categories, tags] +# layouts: +# category: category +# tag: tag +# permalinks: +# tag: /tags/:name/ +# category: /categories/:name/ diff --git a/docs/tutorial/_data/contact.yml b/docs/tutorial/_data/contact.yml new file mode 100644 index 00000000..444e7cb5 --- /dev/null +++ b/docs/tutorial/_data/contact.yml @@ -0,0 +1,40 @@ +# The contact options. 
+ +- type: github + icon: "fab fa-github" + +#- type: twitter +# icon: "fa-brands fa-x-twitter" + +#- type: email +# icon: "fas fa-envelope" +# noblank: true # open link in current tab + +#- type: rss +# icon: "fas fa-rss" +# noblank: true +# Uncomment and complete the url below to enable more contact options +# +# - type: mastodon +# icon: 'fab fa-mastodon' # icons powered by +# url: '' # Fill with your Mastodon account page, rel="me" will be applied for verification +# +# - type: linkedin +# icon: 'fab fa-linkedin' # icons powered by +# url: '' # Fill with your Linkedin homepage +# +# - type: stack-overflow +# icon: 'fab fa-stack-overflow' +# url: '' # Fill with your stackoverflow homepage +# +# - type: bluesky +# icon: 'fa-brands fa-bluesky' +# url: '' # Fill with your Bluesky profile link +# +# - type: reddit +# icon: 'fa-brands fa-reddit' +# url: '' # Fill with your Reddit profile link +# +# - type: threads +# icon: 'fa-brands fa-threads' +# url: '' # Fill with your Threads profile link diff --git a/docs/tutorial/_data/locales/en-customized.yml b/docs/tutorial/_data/locales/en-customized.yml new file mode 100644 index 00000000..a9b16b0d --- /dev/null +++ b/docs/tutorial/_data/locales/en-customized.yml @@ -0,0 +1,91 @@ +# The layout text of site + +# ----- Commons label ----- + +layout: + post: Post + category: Category + tag: Tag + +# The tabs of sidebar +tabs: + # format: : + home: Tutorial + categories: Categories + tags: Tags + archives: Archives + about: About + +# the text displayed in the search bar & search results +search: + hint: search + cancel: Cancel + no_results: Oops! No results found. + +panel: + lastmod: Recently Updated + trending_tags: Trending Tags + toc: Contents + +copyright: + # Shown at the bottom of the post + license: + template: #This post is licensed under :LICENSE_NAME by the author. + name: #CC BY 4.0 + link: #https://creativecommons.org/licenses/by/4.0/ + + # Displayed in the footer + brief: #Some rights reserved. 
+ verbose: #>- + #Except where otherwise noted, the blog posts on this site are licensed + #under the Creative Commons Attribution 4.0 International (CC BY 4.0) License by the author. + +meta: #Using the :PLATFORM theme :THEME + +not_found: + statment: Sorry, we've misplaced that URL or it's pointing to something that doesn't exist. + +notification: + update_found: A new version of content is available. + update: Update + +# ----- Posts related labels ----- + +post: + written_by: By + posted: Posted + updated: Updated + words: words + pageview_measure: views + read_time: + unit: min + prompt: read + relate_posts: Further Reading + share: Share + button: + next: Newer + previous: Older + copy_code: + succeed: Copied! + share_link: + title: Copy link + succeed: Link copied successfully! + +# Date time format. +# See: , +df: + post: + strftime: "%b %e, %Y" + dayjs: "ll" + archives: + strftime: "%b" + dayjs: "MMM" + +# categories page +categories: + category_measure: + singular: category + plural: categories + post_measure: + singular: post + plural: posts \ No newline at end of file diff --git a/docs/tutorial/_data/share.yml b/docs/tutorial/_data/share.yml new file mode 100644 index 00000000..6f975680 --- /dev/null +++ b/docs/tutorial/_data/share.yml @@ -0,0 +1,50 @@ +# Sharing options at the bottom of the post. +# Icons from + +platforms: + - type: Twitter + icon: "fa-brands fa-square-x-twitter" + link: "https://twitter.com/intent/tweet?text=TITLE&url=URL" + + - type: Facebook + icon: "fab fa-facebook-square" + link: "https://www.facebook.com/sharer/sharer.php?title=TITLE&u=URL" + + - type: Telegram + icon: "fab fa-telegram" + link: "https://t.me/share/url?url=URL&text=TITLE" + + # Uncomment below if you need to. 
+ # + # - type: Linkedin + # icon: "fab fa-linkedin" + # link: "https://www.linkedin.com/sharing/share-offsite/?url=URL" + # + # - type: Weibo + # icon: "fab fa-weibo" + # link: "https://service.weibo.com/share/share.php?title=TITLE&url=URL" + # + # - type: Mastodon + # icon: "fa-brands fa-mastodon" + # # See: https://github.com/justinribeiro/share-to-mastodon#properties + # instances: + # - label: mastodon.social + # link: "https://mastodon.social/" + # - label: mastodon.online + # link: "https://mastodon.online/" + # - label: fosstodon.org + # link: "https://fosstodon.org/" + # - label: photog.social + # link: "https://photog.social/" + # + # - type: Bluesky + # icon: "fa-brands fa-bluesky" + # link: "https://bsky.app/intent/compose?text=TITLE%20URL" + # + # - type: Reddit + # icon: "fa-brands fa-square-reddit" + # link: "https://www.reddit.com/submit?url=URL&title=TITLE" + # + # - type: Threads + # icon: "fa-brands fa-square-threads" + # link: "https://www.threads.net/intent/post?text=TITLE%20URL" diff --git a/docs/tutorial/_includes/favicons.html b/docs/tutorial/_includes/favicons.html new file mode 100644 index 00000000..e32e9941 --- /dev/null +++ b/docs/tutorial/_includes/favicons.html @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/docs/tutorial/_includes/sidebar.html b/docs/tutorial/_includes/sidebar.html new file mode 100644 index 00000000..2fac3b16 --- /dev/null +++ b/docs/tutorial/_includes/sidebar.html @@ -0,0 +1,81 @@ + + + + \ No newline at end of file diff --git a/docs/tutorial/_plugins/details_tag.rb b/docs/tutorial/_plugins/details_tag.rb new file mode 100644 index 00000000..dccfad16 --- /dev/null +++ b/docs/tutorial/_plugins/details_tag.rb @@ -0,0 +1,24 @@ +module Jekyll + module Tags + class DetailsTag < Liquid::Block + + def initialize(tag_name, markup, tokens) + super + @caption = markup + end + + def render(context) + site = context.registers[:site] + converter = site.find_converter_instance(::Jekyll::Converters::Markdown) + # 
below Jekyll 3.x use this: + # converter = site.getConverterImpl(::Jekyll::Converters::Markdown) + caption = converter.convert(@caption).gsub(/<\/?p[^>]*>/, '').chomp + body = converter.convert(super(context)) + "
<details><summary>#{caption}</summary>#{body}</details>
" + end + + end + end + end + + Liquid::Template.register_tag('details', Jekyll::Tags::DetailsTag) \ No newline at end of file diff --git a/docs/tutorial/_plugins/posts-lastmod-hook.rb b/docs/tutorial/_plugins/posts-lastmod-hook.rb new file mode 100644 index 00000000..1fd6ecf9 --- /dev/null +++ b/docs/tutorial/_plugins/posts-lastmod-hook.rb @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby +# +# Check for changed posts + +Jekyll::Hooks.register :posts, :post_init do |post| + + commit_num = `git rev-list --count HEAD "#{ post.path }"` + + if commit_num.to_i > 1 + lastmod_date = `git log -1 --pretty="%ad" --date=iso "#{ post.path }"` + post.data['last_modified_at'] = lastmod_date + end + +end diff --git a/docs/tutorial/_posts/.placeholder b/docs/tutorial/_posts/.placeholder new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/docs/tutorial/_posts/.placeholder @@ -0,0 +1 @@ + diff --git a/docs/tutorial/_tabs/dataset.md b/docs/tutorial/_tabs/dataset.md new file mode 100644 index 00000000..46e5ac87 --- /dev/null +++ b/docs/tutorial/_tabs/dataset.md @@ -0,0 +1,113 @@ +--- +layout: post +icon: fas fa-database +title: Dataset +date: 2024-10-16 +toc: true +order: 4 +--- + +# Downloading & Preprocessing +Let's face it: video data is not easily accessible, and there aren't many publicly available sources. In this section, we'll guide you through downloading the necessary datasets, preprocessing the data, and ensuring it's ready for training the Open-Sora model. + +> **Note:** Ensure you have sufficient storage space and bandwidth to download these large datasets. The total required disk space is `~37TB`. +{: .prompt-tip} + +## **Download the Datasets** +We'll be using two primary datasets for our reproduction experiment: +* **[OpenVid](https://github.com/NJU-PCALab/OpenVid-1M)**: Contains 1 million short video clips and corresponding captions. + You can download the dataset from their [Huggingface link](https://huggingface.co/datasets/nkp37/OpenVid-1M). 
+* **[MiraData](https://github.com/mira-space/MiraData)**: Contains 330k long video clips and corresponding captions (other split sizes exist too). + For MiraData, we'll follow the guidance from the author's [repository](https://github.com/mira-space/MiraData/tree/v1?tab=readme-ov-file#download) to download the **330K** version of the dataset (the meta file we use is `miradata_v1_330k.csv`). +* **Custom Dataset**: Our guide also covers how to use your own video dataset consisting of video clips and corresponding captions. + +**Dataset Summary** + +Dataset | License | Dataset Size | Clip Dimensions | Required Disk Space +--------| ------- | ------------ | --------------- | ------------------- +OpenVid | CC-BY-4.0 | 1M Clips & Captions | Various Resolutions & Aspect Ratios | 7.9TB +MiraData | GPL-3.0 | 330k Clips & Captions | `1280x720` and `1920x1080` | 29TB + + +## **Preprocessing the Datasets** +Both OpenVid and MiraData come with video clips and captions. Therefore, we can skip most of the preprocessing steps outlined in the [Open-Sora data processing guide](https://github.com/hpcaitech/Open-Sora/blob/main/docs/data_processing.md). However, we still need to add missing metadata to the CSV file for training purposes and filter out any large or unsupported files. + +### **Required Columns for Training** +To train using the Open-Sora code base, a CSV file with specific columns is required. The necessary columns are: `path`, `text`, `num_frames`, `height`, `width`, `aspect_ratio`, `fps`, `resolution`, and `file_size`. + +But thankfully, there's a script to generate most of these columns from only `path` and `text`. 
+If you have a CSV file (`dataset.csv`{: .filepath}) containing the `path` and `text` columns, you can compute the remaining required columns from these two by executing the following command: + +```bash +python -m tools.datasets.datautil dataset.csv --info --remove-empty-caption +``` +The command will execute concurrently, generating a new file named `dataset_info_noempty.csv`{: .filepath}. This file will contain all the required metadata columns and exclude any entries with empty captions. + +### **Filtering Large Video Clips** +To optimize training performance, we remove video clips larger than `50MB`, as they are more expensive to load during training. + +```bash +python -m tools.datasets.filter_large_videos dataset_info_noempty.csv 50 +``` +This results in a new file called `dataset_info_noempty_le50MB.csv`{: .filepath}. + + +### **Filtering Broken and Unsupported Files** +Open-Sora uses `ffmpeg` under the hood to open files on-the-fly. +Some video clips may cause FFMPEG warnings or errors and, in the worst case, crash the training process. To prevent this, we need to filter out files that FFMPEG cannot decode. This process is CPU-intensive, so we'll parallelize it across multiple servers. + +The idea of filtering is simple: read each file with ffmpeg, write any error output to a file called `$filename.err`, and then filter using the file size of the error file. + +> **Warning:** This filtering process can be time-consuming depending on the size of your dataset and the number of nodes available for parallel processing. +{: .prompt-warning} + +### **Steps to Filter Out Problematic Video Clips** + +1. **Create `filenames.txt` containing all filenames:** + Extract the `path` column from `dataset_info_noempty_le50MB.csv` and save it to `filenames.txt`: + ```bash + python -c "import pandas as p, sys; p.read_csv(sys.argv[1]).path.to_csv(sys.argv[2], index=0, header=0)" dataset_info_noempty_le50MB.csv filenames.txt + ``` +2. 
**Split `filenames.txt` into Sub-Lists for Parallel Processing:** + Assuming you have `24` nodes available for checking, split `filenames.txt` into 24 sub-lists: + ```bash + split -n l/24 filenames.txt part_ + ``` + This will create files named `part_aa`, `part_ab`, ..., `part_ax`. +3. **Adapt and Run the FFMPEG Check Script on All Nodes** + The following script will create `.err` files alongside each video file in `filenames.txt`. An empty `.err` file indicates no errors, while a non-empty file signifies an FFMPEG error with that video. + + ```bash + paste nodes.txt <(ls ./data_csvs/part_* | sort) | parallel --colsep '\t' ssh -tt {1} "bash $(pwd)/tools/datasets/ffmpeg_check_parallel.sh $(pwd)/{2}" + ``` +4. **Filter Out Files with FFMPEG Errors** + Run the following command to filter out video files that have FFMPEG errors: + ```bash + python -m tools.datasets.ffmpeg_filter_without_errors dataset_info_noempty_le50MB.csv + ``` + + This will generate a new file named `dataset_info_noempty_le50MB_withouterror.csv`{: .filepath} excluding the problematic video clips, ensuring a stable training dataset. + + +## **Storing the Dataset on Shared Storage** + +After preprocessing, to make sure that all compute nodes have access to the preprocessed dataset, store it on a shared storage system accessible by all nodes. + +For the remainder of this tutorial, we'll assume that the filtered CSVs are saved in the training repository as follows: +- the CSV for OpenVid data under `OpenVid1M.csv`{: .filepath} +- the combined CSV for OpenVid and MiraData data under `OpenVid1M-Miradata330k.csv`{: .filepath} + +> **Important:** Ensure that the shared storage is mounted and accessible from all nodes in your cluster before initiating the training process. +{: .prompt-tip} + + +
+ +--- + +**What Next?**: +By following these steps, you've successfully downloaded, preprocessed, and prepared the dataset required for training the Open-Sora model. You're now ready to proceed to the next stage: training the model on your cluster. + +Proceed to the [**Training** — Get the Ball Rolling](../training) section to begin training! + +--- diff --git a/docs/tutorial/_tabs/introduction.md b/docs/tutorial/_tabs/introduction.md new file mode 100644 index 00000000..25bee868 --- /dev/null +++ b/docs/tutorial/_tabs/introduction.md @@ -0,0 +1,77 @@ +--- +layout: post +icon: fas fa-lightbulb +title: Introduction +date: 2024-10-16 +toc: true +order: 1 +--- + +# Let's reproduce a **T2V** model! +In this tutorial, we will walk through the process of replicating [**Open-Sora 1.2**](https://github.com/hpcaitech/Open-Sora), a **1.1B parameter Text-to-Video (T2V) model** that utilizes a transformer-based diffusion architecture. Unlike Text-to-Image (T2I) models, Text-to-Video models necessitate a distinct scaling approach. This guide will cover the steps for downloading and preparing the dataset, training the model from scratch, tackling the specific challenges encountered when training at scale, and offering advice for troubleshooting distributed training jobs. + + + +## Why This Tutorial? +Recent T2I training methods can [complete training in about 2.6 days on a single 8xH100 GPU machine](https://arxiv.org/abs/2407.15811), which amounts to around 500 GPU hours. On the other hand, *T2V models* are still in their early stages, and we are yet to discover the most effective approaches. The open-source project we are replicating is a relatively small model with limited capabilities, but it needs roughly six days of training on 192 H100 GPUs — that's about **28,000 GPU hours** — between one and two orders of magnitude more (roughly 56×) than a fast T2I training scheme! 
+ +This also implies that after a full day of training on a single 8xH100 GPU machine, we won't observe significant progress, as one day of training accounts for less than one percent of the total training time. In this scenario, **trusting the process** is essential. The goal of this tutorial is to help ensure that resources are utilized effectively by identifying and eliminating points of failure as early as possible, and to emphasize the differences that arise on such a large scale, the potential problems they pose, and — most importantly — how to tackle and solve them. + + + +## Who Is This Tutorial For? +Unfortunately, this isn't a casual walkthrough that you can follow on your MacBook, but a comprehensive exploration of the challenges in scaling up Text-to-Video (T2V) machine learning models when standard waiting times become insufficient. If you're seeking a document to jumpstart your training process and wish to understand the distinctions and potential issues in large-scale training jobs, this tutorial is tailored for you. + +According to [Meta's Llama 3 paper](https://arxiv.org/abs/2407.21783), approximately **30% of failed training jobs resulted from malfunctioning GPUs**. However, hardware issues are not the only concerns; other difficulties may emerge. We will cover best practices for identifying bugs in distributed Python code, how to approach inexplicable NCCL errors, and how to address data-related training obstacles. + + + +## Tutorial Overview & What You'll Learn + +### [**Lessons Learned** — Model Divergence, Cluster Debugging, NCCL Errors](../lessons) +Let's start with the problems we came across and their solutions. +- **Resolving Model Convergence Problems**: Learn how to tackle issues when your model does not converge as anticipated. +- **Debugging on a Cluster**: Discover how to utilize [py-spy](https://github.com/benfred/py-spy) for debugging cluster-wide running code. We will debug the distributed training data loader as an example. 
+- **Random NCCL Errors**: Obtain advice on handling the intricacies of training on a cluster. + + +### [**Setup** — Clone, Install & Setup your Cluster](../setup) +To begin training, we'll go through the following steps to set everything up: +- **Basic Setup**: + - We'll guide you through cloning and configuring the required codebase. + - Installing conda & dependencies +- **Preparing the Cluster**: + - Making sure that all nodes have access to needed files (dataset, huggingface weights, conda environments) + - Defining the nodes list for the training job + - Setting up Weights & Biases (wandb) + + +### [**Dataset** — Downloading & Preprocessing](../dataset) +Let's face it: Video data is not easily accessible, and there aren't many public sources available. +For our reproduction experiment, we will: +- Download [**OpenVid**](https://github.com/NJU-PCALab/OpenVid-1M) and [**MiraData**](https://github.com/mira-space/MiraData) datasets. +- Go through the essential steps to **preprocess the datasets**. +- Discuss how to efficiently **parallelize** preprocessing tasks using your cluster without writing any code, by leveraging Unix's built-in capabilities. + + +### [**Training** — Get the Ball Rolling](../training) +And of course, let's start training! Training on a larger scale comes with its own set of challenges. Here's what we will address: +- **Training Configurations**: We will recommend settings for a speed run (18k GPU hours) and an additional 7k GPU hours run to enhance the results. We will discuss the expectations from each setting and share intermediate and final results for our runs. +- **Starting and Monitoring Training on a Cluster**: Open-Sora is built on top of the [ColossalAI launcher](https://colossalai.org/). We'll start by simply providing the commands to get you started and how to monitor loss curves in [weights and biases](https://wandb.com). +- **Evaluating Model Quality**: Learn how to assess model performance using a separate inference server. 
+- **Monitoring Cluster Health**: Large-scale distributed training inevitably runs into downtime, which should be tracked carefully throughout the run. + + +By the end of this tutorial, you'll have a comprehensive understanding of what's involved in scaling up T2V models like Open-Sora 1.2. You'll be better equipped to handle the challenges that come with large-scale training and better prepared to troubleshoot and optimize your models effectively. +It's time to dive right in! In the [next section](../lessons), we'll share insights on various challenges we faced—from finding data loader bugs that led to diverging training, to debugging issues in worker code that appeared randomly across the cluster, and tackling low-level problems with NCCL on a bare-metal setup. + + +
+ +--- + +**So Let's Get Started!** + +Before we jump into setting everything up, we'd like to share some valuable lessons we learned along the way. In the [next section](../lessons), we'll delve into the challenges we faced — from uncovering data loader bugs that caused training divergence, to debugging elusive issues in worker code that appeared randomly across the cluster, and tackling low-level problems with NCCL on a bare-metal setup. + +--- \ No newline at end of file diff --git a/docs/tutorial/_tabs/lessons.md b/docs/tutorial/_tabs/lessons.md new file mode 100644 index 00000000..2b97bd54 --- /dev/null +++ b/docs/tutorial/_tabs/lessons.md @@ -0,0 +1,155 @@ +--- +layout: post +icon: fas fa-graduation-cap +title: Lessons Learned +date: 2024-10-16 +toc: true +order: 2 +--- + +# From Training Troubleshooting to Cluster Issues + +Before diving into training the model from scratch, we wanted to share some valuable lessons we learned along the way. In this section, we'll address problems we came across that you might encounter too, and explain how to solve them. These range from model divergence issues to debugging on a cluster and tackling cluster-related configuration problems that can be challenging to debug. + +## Debugging Model Convergence Issues +> **TL;DR:** We experienced model divergence when changing datasets mid-training due to mismatched data loader states, leading to repeated batches and divergence. By fixing the data loader to match the new dataset and smoothing transitions between data chunks, we resolved the convergence issues. + +When your model isn't converging as expected, it can be frustrating. In an earlier experiment, we tried splitting our data into five smaller chunks of increasing difficulty, each to be trained for two epochs. While this approach isn't part of the main tutorial, we thought it would be helpful to explain what went wrong and how we resolved the issues. 
+ +The loss and weight norm curves for the full 10 epochs of training looked like this: + +![Loss Curve](./assets/fails_loss.png) ![Weight Norm Curve](./assets/fails_weight_norm.png) + +As you might notice, there are three divergence points where we needed to roll back to an earlier checkpoint and restart training, even though we hadn't changed any hyperparameters. + +**Problems We Found During Training** +The key issue stemmed from how we swapped the sub-datasets. When we switched to a new data chunk, the number of batches changed. We continued training using the `--load` flag (see [Training Section](../training)), which meant that the saved data loader state did not match the new dataset. This mismatch led to some batches being shown multiple times in succession. + +Since the data loader state was assuming a dataset length different from the actual one, we encountered divergences: + +- **Blue Curve**: When the current index of the pre-calculated batches (from a different number of samples) was greater than expected, the network diverged. We fixed this in an earlier implementation and resumed training successfully (shown in the brown curve). +- **Violet Curve**: We still had an off-by-one bug in our fix for the data loader. Interestingly, this didn't lead to divergence, but we saw a sudden but slight decrease in output quality with each new epoch. The outputs became brighter, and we observed small jumps in the average weight norm of the network, happening at the last batch of each epoch. We fixed that off-by-one error and continued training again (shown in the mint curve). +- **Purple Curve**: Since we tried to increase training speed by splitting data into levels of increasing quality (using the number of nouns in the video descriptions as an indicator), we found that the last jump — from difficulty level "4 of 5" to "5 of 5" — was too drastic. This significant drift in data has led to the divergence seen in the purple curve. 
Removing the hardest `0.01%` of that data solved the problem, and training finished successfully. + +## Debugging on a Cluster +> **TL;DR:** Our training processes were freezing randomly without errors. Using py-spy, we discovered that garbage collection issues in PyAV within our data loader were causing the freezes. Refactoring the code to use a single PyAV instance eliminated the problem. + +When training on a cluster, you might run into issues that are hard to diagnose. Particularly data loader code is inherently highly parallel: on every node in the cluster, for every training job (one per GPU), we have multiple workers (in our case, 16 per training process/GPU) that read data to feed into the training script as efficiently as possible. +Unfortunately, we found that our training froze unexpectedly without any errors — and worse, it happened randomly every 2 to 6 hours. Checking memory usage, CPU usage, and other metrics didn't reveal any issues. + +Training on a large cluster is expensive, so while [simple solutions](https://xkcd.com/1495/) like restarting everything recurrently might seem effective in the short term, we needed a more sustainable fix since stopping and starting the cluster also incurs costs: The training startup is slow due to launching jobs on all nodes, loading checkpoint states, and any just-in-time compilation that might need to happen beforehand. + +**So, how can we debug a problem like random data loader freezes?** +We found that [py-spy](https://github.com/benfred/py-spy) was invaluable for this task. `py-spy` has several modes, and we were interested in using the `dump` feature, which prints out the current process stack trace for a given PID. 
+ +We used the following command to run `py-spy dump` for all workers on all our machines from our head node: +```bash +{% raw %} parallel -a nodes.txt -S {} "ssh {} 'pgrep -f python.*train.py | xargs -I {{}} py-spy dump --pid {{}} > ./dumps/{{}}.out'" {% endraw %} +``` + +We then looked for unusual methods where our processes were spending a lot of time by examining active threads (most other threads in the dumps were idle). +By re-evaluating after several minutes, we saw that the same processes were still stuck on the same lines—nothing had changed. +The line standing out in the dump is the `_read_from_stream` method. + +![](./assets/pyspy_dump.png) + +Looking at the [specific line in the code](https://github.com/LambdaLabsML/Open-Sora/blob/6694aa19406b6baf6bf7348afba13ab7587c4aca/opensora/datasets/read_video.py#L211), we found the following line causing the problem. + +```python +210 result = video_frames[start_ptr:end_ptr].copy() +211 return result +``` + +It seemed to be related to garbage collection. + +Since PyAV is used under the hood, we checked their documentation and discovered their notes on [Caveats related to Garbage Collection](https://pyav.org/docs/stable/overview/caveats.html#garbage-collection), possibly causing freezes due to opening and closing instances frequently. We refactored [the code to use only a single PyAV instance](https://github.com/LambdaLabsML/Open-Sora/commit/dd967bec221bc9579094a4b529b9769612a4b84c), which solved the problem! + + +## Random NCCL Errors +> **TL;DR:** We faced frequent NCCL errors causing training crashes, traced back to clock synchronization issues across cluster nodes. Implementing proper time synchronization using Chrony resolved these errors and stabilized our training runs. + +Another issue we encountered was random crashes due to NCCL errors. These crashes happened more regularly, sometimes every 30 minutes, and also required us to manually restart training. 
+ +The key error message was: +``` +Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data. +``` + +After some investigation, we realized that the issue was caused by clock synchronization problems across the cluster nodes. One of our team members observed that some nodes' clocks were moving backwards: + +> "I noticed that one node's clock jumped backwards—the last line of `dmesg` was a few seconds ahead of the current time on that node. It appears that the polling interval is 34 minutes, and the upstream NTP server is behind this node, causing the clock to slew backwards. The freeze periodicity seems to match this interval." + +The solution was to address the clock synchronization issues by switching the NTP client to Chrony on the cluster head node. This change resolved the hangs caused by NCCL. + + +## Monitoring Model Quality + +While tracking loss curves in [Weights & Biases](https://wandb.com) provides valuable insights during training, the loss values often plateau after the initial few epochs. This makes it essential to evaluate the model beyond numerical metrics by assessing the quality of videos generated from a set of validation prompts. To do this, we need to run the most recent model weights in inference mode regularly. + +However, running inference on a Text-to-Video (T2V) model is computationally expensive. For example, generating videos at 720p resolution, 4 seconds in duration, with a batch size of 2 can already take X minutes on an H100 — we don't want the entire cluster to idle while waiting for some nodes to finish evaluation! + +To address this, we set up a separate, smaller server to handle inference asynchronously. This allows the main training process to continue uninterrupted, maximizing our computational resources. 
The inference server runs the latest model checkpoints, generates sample videos, and saves the outputs to the same Weights & Biases runs that training is logging to, as you've seen in the result sections above. + + +### Setting Up the Inference Server + +> **Note:** Setting up the inference server requires that the codebase and environment are properly configured. Setting up an inference server is something we learned to be highly valuable during our training process. We recommend revisiting this section after you've followed the instructions in the (next) [Setup](../setup) section. +{: .prompt-tip} + +The inference server in this repository is designed to work asynchronously and supports several modes. Here's how you can set it up: +1. **Synchronize Checkpoints** + First, we need to synchronize the latest checkpoints from the training cluster to the inference server. If you don't have access to your shared storage on this dedicated inference machine, you can use `rsync` for this purpose to query the checkpoints regularly. + + ```bash + watch -n 100 rsync -avzruP --exclude='*/optimizer' training-cluster:/path/to/your/training/outputs/* . + ``` + + This command runs every 100 seconds and synchronizes new checkpoints, excluding optimizer states to save bandwidth and storage. +2. **Initialize the Node & Log In to W&B**: + Ensure that both W&B and Open-Sora are properly initialized and functioning on this node. If the node is not included in the cluster where you have previously completed the setup, please refer to the instructions provided in the [Setup](../setup) section. +3. **Run the Inference Server** + Next, we start the inference server using the desired preset and experiment numbers: + + ```bash + python scripts/inference-server.py --preset low4s --expnums 656 + ``` + + - `--preset`: Specifies the inference settings. Available options include `low4s`, `high4s`, and `low16s`, which correspond to different resolutions and durations. 
+ - `--expnums`: Specifies the experiment numbers or checkpoint directories to monitor and execute inference on. The experiment number is used to automatically extract the W&B run-id, enabling the inference server to identify the location to push the results to. + + You can explore additional options and features by running: + + ```bash + python scripts/inference-server.py --help + ``` + + For instance, you can run a second server that computes the `720p` results. + ```bash + python scripts/inference-server.py --preset high4s --expnums 656 + ``` + +By setting up the inference server this way, we can continuously monitor the model's output quality without interrupting the training process. This approach ensures that our valuable training resources remain focused on model optimization, while inference and evaluation happen in parallel. + + + + +## Monitoring Cluster Health +As highlighted in [the Llama 3 paper](https://arxiv.org/abs/2407.21783), large-scale distributed training can often face downtime. We too experienced this firsthand during our training runs. +Thus, while your training is running, it's crucial to keep an eye on the health of your cluster. We use an internal tool to monitor cluster performance, regularly checking for any signs of degrading performance. This tool logs metrics such as power draw across the entire cluster and InfiniBand or Ethernet speeds. + + + +![](./assets/monitoring_tool.png) + + + +
+ +--- + +**What Next?** +With these lessons learned, you can now proceed to the [**Setup**](../setup) section to set up the codebase and begin training. + +We'll guide you through cloning the repository, installing dependencies, and configuring your cluster. You'll learn how to create a shared folder, install Miniconda on all nodes, clone the required codebase, and ensure all nodes have consistent environments and access to necessary files. Maintaining uniformity across all nodes is essential, as inconsistencies can lead to challenging bugs during the training process. + +--- \ No newline at end of file diff --git a/docs/tutorial/_tabs/setup.md b/docs/tutorial/_tabs/setup.md new file mode 100644 index 00000000..57b98af3 --- /dev/null +++ b/docs/tutorial/_tabs/setup.md @@ -0,0 +1,282 @@ +--- +layout: post +icon: fas fa-cogs +title: Setup +date: 2024-10-16 +toc: true +order: 3 +--- + +# Clone, Install & Set Up Your Cluster + +You will learn how to create a shared folder, install Miniconda on all nodes, clone the required codebase, and verify that all nodes have access to the necessary files and consistent environments - and most importantly - to ensure uniformity across all nodes, as inconsistencies can result in challenging-to-identify bugs during the training process. + +## **Using a Shared Folder** +> **Note:** Make sure to use a shared folder that is accessible by all nodes in your cluster. Alternatively, ensure that the environment is identical across the entire cluster **after installing all dependencies** to avoid potential issues. +{: .prompt-tip } + +First, it is essential to create a shared folder that can be accessed by all nodes within your cluster. This ensures uniformity and eliminates potential issues arising from discrepancies in the environments of different nodes. + +This tutorial will proceed under the assumption that you have established a shared folder accessible throughout your cluster. 
+Now, let's define the variable for this shared folder: +```bash +export SHAREDDIR=./shared/folder/path +``` + + +## **Compiling a List of Cluster Nodes** +To begin, create a list of all the nodes in your cluster and store it in an easily accessible location, like `~/nodes.txt`{: .filepath}. This file defines which of the nodes will be used in parallel execution and parallel training processes. + +Create a file named `nodes.txt`{: .filepath} containing the SSH host names of all nodes to be used: +```bash +node-001 +node-002 +node-003 +... +``` +(Of course, replace the placeholders `node-001`, `node-002`, and so on, with the specific hostnames or IP addresses that correspond to the nodes in your cluster.) + + +**Important:** Make sure that the starting node for training is listed as **the first entry** in the text file to avoid potential errors during the initiation of multi-node training. +{: .prompt-tip} + +**Note:** Confirm that SSH access is set up for all nodes to allow the execution of remote commands. +{: .prompt-tip} + + + +## **Installing Miniconda Across All Nodes** + +Miniconda is a minimal installer for Conda, which is an open-source package management system and environment management system. By installing Miniconda, we can efficiently manage Python environments and dependencies like CUDA and NCCL. Following the installation, we will make sure that every node has access to the identical environment. + +1. **First, we'll install Miniconda3 on a single node:** + 1. **Install Miniconda3 in the shared folder**, for example, `$SHAREDDIR/miniconda3/`. + To download the installer, use the following command: + ```bash + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $SHAREDDIR/miniconda.sh + ``` + 2. **Run the installer**. + The following installation command will install Miniconda3 in the shared directory `$SHAREDDIR/miniconda3`. + ```bash + bash $SHAREDDIR/miniconda.sh -b -p $SHAREDDIR/miniconda3 + ``` +3. 
**Initializing Conda on All Nodes** + Once you have installed Miniconda, it is necessary to add the Conda initialization snippet to the `.bashrc` file on each node. This step guarantees that the Conda environment is correctly configured every time you log in. + + 1. **Create a file `$SHAREDDIR/conda_init.sh` using the following command:** + ```bash + cat <<EOF > $SHAREDDIR/conda_init.sh + # >>> conda initialize >>> + # !! Contents within this block are managed by 'conda init' !! + __conda_setup="\$('$SHAREDDIR/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" + if [ \$? -eq 0 ]; then + eval "\$__conda_setup" + else + if [ -f "$SHAREDDIR/miniconda3/etc/profile.d/conda.sh" ]; then + . "$SHAREDDIR/miniconda3/etc/profile.d/conda.sh" + else + export PATH="$SHAREDDIR/miniconda3/bin:\$PATH" + fi + fi + unset __conda_setup + # <<< conda initialize <<< + EOF + ``` + 2. **Now, append this snippet to the `.bashrc` file on all nodes.** + You can do this efficiently by using the following command: + ```bash + parallel -a ~/nodes.txt ssh {} 'cat $SHAREDDIR/conda_init.sh >> ~/.bashrc' + ``` + This command reads the list of nodes from `~/nodes.txt` and appends `conda_init.sh` to `~/.bashrc` on each node using `cat`. + 3. **Verify the availability of the Conda environment on all nodes** + ```bash + parallel -a ~/nodes.txt ssh {} 'source ~/.bashrc && command -v conda >/dev/null 2>&1 && echo "{}: Conda is installed" || echo "{}: Conda is NOT installed"' + ``` +4. (Optional) **Ensure Only Conda's Python Libraries Are Used** + + To prevent Python from inadvertently importing packages from `~/.local/python`, it's best to disable user site-packages. This ensures that only the libraries managed by Conda are used during execution. 
+ + **Export the `PYTHONNOUSERSITE` Environment Variable** + + Add the following line to the end of your `.bashrc` file on all nodes: + + ```bash + export PYTHONNOUSERSITE=True + ``` + + > **Important:** Setting `PYTHONNOUSERSITE=True` ensures that Python does not consider the user-specific site-packages directory (such as `~/.local/lib/pythonX.Y/site-packages`) when importing modules. This helps maintain a clean and predictable Python environment, preventing conflicts with Conda-managed packages. (Bugs are especially hard to find, if all but only a few nodes use the conda environment, while some nodes use different versions that have been installed locally.) + {: .prompt-tip} + + You can append this line to all nodes using the following command: + + ```bash + parallel -a ~/nodes.txt ssh {} 'echo "export PYTHONNOUSERSITE=True" >> ~/.bashrc' + ``` + + + +## **Cloning and Configuring the Codebase** +Next, clone the tutorial's Open-Sora fork: [https://github.com/LambdaLabsML/Open-Sora]. + +1. **Clone the Repository** + ```bash + git clone https://github.com/LambdaLabsML/Open-Sora.git + ``` +2. **Navigate to the Repository** + ```bash + cd Open-Sora + ``` +3. **Run the Installation Script** + To set up the environment and install the required dependencies, simply execute the [installation script](https://github.com/LambdaLabsML/Open-Sora/blob/main/install.sh). This script will create a Conda environment and handle the entire installation process for you. + + > **Note:** The installer comes in two versions: `install.sh`, which uses PyTorch 2.2 (the version used by Open-Sora), and `install-pytorch23.sh`, which uses PyTorch 2.3 (requiring a bit more work). We suggest using the PyTorch 2.2 version unless you encounter problems, such as issues with the NCCL version, in which case you can switch to PyTorch 2.3. + {: .prompt-tip} + + This process may take some time. Feel free to grab a coffee while you wait or read what it does [below](#installation-details)! 
+ ```bash + yes | bash install.sh + ``` + + {: .prompt-tip} +4. **Activating the Conda Environment** + After the installation completes, activate the newly created Conda environment `osora-12`: + ```bash + conda activate osora-12 + ``` +5. **Verifying the Installation** + To confirm that everything is set up correctly, run the [installation checker](https://github.com/LambdaLabsML/Open-Sora/blob/main/install-check.py): + (Use `install-check-pytorch23`{: .filepath} in case you have chosen PyTorch2.3) + + ```bash + python install-check.py + ``` + + If everything went well, you should see the following output: + + ```shell + Starting environment check... + + Checking nvcc version... OK + Checking Python version... OK + Checking PyTorch version... OK + Checking CUDA version... OK + Checking Apex... OK + Checking Flash Attention... OK + Checking xFormers... OK + + SUCCESS: All checks passed! + ``` + + +### **Installation Details** + +**Optional Section:** The information below offers further details about the processes carried out by the installation script. If you do not wish to delve into these specifics, feel free to move on to the next section. + + +{% details Click here to read more details about the installation script. %} + +- **Creating a New Conda Environment** + The script creates a new Conda environment named `osora-12`. +- **Installing CUDA Dependencies** + CUDA dependencies are installed via the [NVIDIA channel](https://anaconda.org/nvidia/cudatoolkit), all fixed to version 12.1 to ensure compatibility. +- **Compiling and Installing NCCL** (only the PyTorch 2.3 installer) + NCCL (version 2.20.5-1) is compiled and installed. This version has been tested and works well in our setup. +- **Installing PyTorch** + PyTorch 2.2 or 2.3.1 is installed depending on the installation script used. PyTorch 2.3.1 allows dynamic linking of NCCL, enabling you to test other NCCL versions without recompiling PyTorch. 
+- **Installing xFormers** + [xFormers](https://github.com/facebookresearch/xformers) version 0.0.26 is installed. This package provides efficient Transformer building blocks and is compiled against PyTorch. +- **Installing FlashAttention** + [FlashAttention](https://github.com/HazyResearch/flash-attention) version 2.5.8 is installed. It's an efficient attention implementation that speeds up Transformer models, also compiled against the specific PyTorch version. +- **Installing Apex** + [Apex](https://github.com/NVIDIA/apex.git) is installed. This PyTorch extension contains tools for mixed precision and distributed training, also compiled against the specific PyTorch version. +- **Installing Other Dependencies** + - **ColossalAI** + [ColossalAI](https://github.com/hpcaitech/ColossalAI) package provides a unified interface for large-scale model training, also compiled against the specific PyTorch version. It is installed from a custom branch that works with PyTorch 2.3 if the `install-pytorch23.sh` Installer is used + - **Diffusers** + [Diffusers](https://github.com/huggingface/diffusers) is installed, which is a dependency of ColossalAI. This package offers tools for diffusion models. +- Small **Bug Fixes** + - **YAPF Package** version pinning. (Due to a bug with the Open-Sora code with a later version) + - **Protobuf Package** version pinning. (Due to a bug with the Open-Sora code with a later version) +{% enddetails %} + + +## **Preparing the Cluster** + +Ensuring that all nodes in your cluster have access to the necessary files and environments is critical for distributed training. + +> **Important:** The following steps will also download the parts of the network that are not trained from scratch, such as the pre-trained [3D VAE](https://github.com/hpcaitech/Open-Sora/blob/main/docs/report_03.md#video-compression-network) for efficient video compression and the [T5 text model](https://huggingface.co/docs/transformers/model_doc/t5). 
+{: .prompt-tip} + +### **Distributing Necessary Files Across Nodes** + +1. **Perform a Test Inference & Download Required Model Weights** + We'll run an inference script to (1) test if the installation is working correctly and (2) to automatically download the required pre-trained models into the Hugging Face cache directory `~/.cache/huggingface/hub`{: .filepath}. + ```bash + python scripts/inference.py configs/opensora-v1-1/inference/sample.py --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 + ``` +2. **Copying the Hugging Face Cache to Shared Storage** + Since only the current node has the downloaded models so far, we'll copy them to a shared directory accessible by all nodes. + ```bash + cp -r ~/.cache/huggingface/hub $SHAREDDIR/opensora_hub_ckpts + ``` +3. **Distributing the Cache to All Nodes** + Now, copy the cache directory to the Hugging Face cache directory on each node. + This ensures that all nodes have access to the pre-trained models required for training and inference. + (If you don't do that and the nodes don't download the models themselves, the error messages don't explicitly tell you that weights are missing. ) + ```bash + parallel -a ~/nodes.txt ssh {} 'cp -r $SHAREDDIR/opensora_hub_ckpts/* ~/.cache/huggingface/hub/' + ``` + + +## **Initializing Weights & Biases (wandb) on All Nodes** + +[Weights & Biases](https://wandb.ai/) (wandb) is a tool for tracking experiments, visualizing metrics, and collaborating with others. We'll initialize wandb on all nodes to monitor our training process. + +### **Setting Up wandb** + +1. **Install wandb in the Conda Environment** + + If wandb is not already installed, install it in your `osora-12` environment: + + ```bash + conda activate osora-12 + pip install wandb + ``` +2. **Log In to wandb** + + You'll need to log in to wandb on each node. However, since we have a shared home directory or can execute commands on all nodes, we can automate this process. 
+ + **Option 1: Using Shared Configuration** + If your home directory is shared across all nodes, logging in once is sufficient. + + **Option 2: Automating Login on All Nodes** + If you need to log in on each node individually, you can use the following command: + + ```bash + parallel -a ~/nodes.txt ssh {} 'wandb login YOUR_WANDB_API_KEY' + ``` + + Replace `YOUR_WANDB_API_KEY` with your actual wandb API key, which you can retrieve under the following link: [wandb.ai/authorize](https://wandb.ai/authorize). + + > **Note:** Ensure that you're securely handling your API key. Avoid exposing it in shared scripts or logs. + {: .prompt-tip} +3. **Verify wandb Initialization** + + You can verify that wandb is set up correctly on all nodes by running the following command: + ```bash + parallel -a ~/nodes.txt ssh {} 'python -c "import wandb; wandb.init(project="open-sora-test"); wandb.log({"test_metric": 1})"' + ``` + + Check your wandb dashboard to see if the test runs have been logged for all nodes. + + +
+ +--- + +**What Next?**: +By completing these steps, you've set up your cluster environment, cloned and configured the Open-Sora codebase, ensured all nodes have the necessary files and models, and initialized wandb for experiment tracking. You're now ready to proceed to the next stage: downloading and preprocessing the dataset. + +Proceed to the [**Dataset** — Downloading & Preprocessing](../dataset) section to begin working with the data required for training the Open-Sora model. + +--- \ No newline at end of file diff --git a/docs/tutorial/_tabs/training.md b/docs/tutorial/_tabs/training.md new file mode 100644 index 00000000..acca79be --- /dev/null +++ b/docs/tutorial/_tabs/training.md @@ -0,0 +1,425 @@ +--- +layout: post +icon: fas fa-play-circle +title: Training +date: 2024-10-16 +toc: true +order: 5 +--- + +> While the reading time of this section may be low, note that running the commands on this page may take a bit longer. For us the total time was `~6` days on a cluster with `192` H100 GPUs. +{: .prompt-tip} + + +# Get the Ball Rolling +In this tutorial, we'll primarily follow the multi-stage training recipe of **OpenSora 1.2**. The original approach involves three training stages, with each stage focusing on higher video resolutions than the previous one. +The original recipe involves three stages of training, with the primary distinction of an increase of resolution of the input videos used for training the model from one stage to the next. + +**Original OpenSora 1.2 training configuration**: +In detail, the training stages are defined as follows: +1. **Preparatory Stage**: This initial stage gradually adapts the T2I (Text-to-Image) model's weights from [PixArt-Σ 2K](https://pixart-alpha.github.io/PixArt-sigma-project/) to the proposed Text-to-Video (T2V) architecture named [STDiT](https://github.com/hpcaitech/Open-Sora/blob/main/docs/report_01.md#efficiency-in-choosing-the-architecture) (**S**patio-**T**emporal **Di**ffusion **T**ransformer). +2. 
**Stage One** ([config file](https://github.com/hpcaitech/Open-Sora/blob/main/configs/opensora-v1-2/train/stage1.py)): Focuses on `240p` and `360p` video resolutions, with video lengths ranging from 2 to 16 seconds. +3. **Stage Two** ([config file](https://github.com/hpcaitech/Open-Sora/blob/main/configs/opensora-v1-2/train/stage2.py)): Emphasizes `360p` and `480p` video resolutions. +4. **Stage Three** ([config file](https://github.com/hpcaitech/Open-Sora/blob/main/configs/opensora-v1-2/train/stage3.py)): Concentrates on `720p` and `1080p` video resolutions. + + +Unfortunately, the conversion process used to transform Pixart-Σ 2K weights to STDiT weights lacks both a config and specific commits to reproduce the original results. Although the team outlines the required model code adaptions in their [report notes](https://github.com/hpcaitech/Open-Sora/blob/476b6dc79720e5d9ddfb3cd589680b2308871926/docs/report_03.md#rectified-flow-and-model-adaptation), we will adopt a simpler approach for this tutorial at the cost of a decrease in quality. + +## Speed Run +Instead of Open-Sora's 35k H100 GPU hour training run (not counting the weight conversion training time), our approach for this tutorial involves training for approximately half that duration and skipping the preparation stage. We aim to assess the model's capabilities within this reduced budget. Subsequently, we'll invest an additional 7k GPU hours to evaluate whether the model's performance improves. We'll share the intermediate and final outcomes of our runs and examine the two configurations we've experimented with. + +### Configuration +For our training setup, we utilized a cluster of 192 H100 GPUs, aiming to closely follow Open-Sora's original approach across the three training stages. However, we made some key adjustments to adapt to our specific requirements. + +Firstly, we modified the learning rate to counter the increased batch size, which effectively increases the implicit step size in training. 
Observing the loss curves, we decided to also reduce the number of warmup steps from `1000` to 400, as we felt that an extensive warmup wasn't necessary. + +Additionally, we incorporated weight decay into our training, setting it to a relatively high value of 0.01 based on recommendations from [this paper](https://arxiv.org/abs/2407.15811). This adjustment was made after noticing that adding weight decay led to more significant changes when the model seemed stuck in local minima. + +Our tutorial, aiming to mimic a typical research-oriented foundation model training, involves two main parts: a base-line model and then tests to further improve the quality. +- **18k GPU hour run**: Using only small changes to the original three stages that we're confident will improve training (like weight decay adaptation). Since this base-line training is using only half of Open-Sora's training time, and we're basically training from scratch, as the weights aren't converted "correctly" from PixArt to STDiT, we don't expect great results yet here. +- **Additional 7k GPU hour run**: To further improve performance of the base model, we tested a different learning rate schedule with a small "warmup bump" to allow for more drastic changes at the start of training, and re-apply the same three-stage training recipe as before to see if training longer (and reiterating on lower resolutions again) improves training. We remove masking here. While masking increases training performance, it has an exponential negative impact on output quality, as noted in [this paper](https://arxiv.org/abs/2407.15811). We'll see if their observation holds up and the quality increases faster with masking turned off. 
+ + +## Commands and Helpers + +### Training Command +Here's how a typical training command looks: +``` +NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 \ +NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 \ +TORCH_NCCL_ENABLE_MONITORING=1 \ +TOKENIZERS_PARALLELISM=false \ +OMP_NUM_THREADS=16 \ +colossalai run --nproc_per_node 8 \ +--hostfile nodes.txt \ +scripts/train.py configs/opensora-v1-2/lambda/stage1.py \ +--data-path OpenVid1M.csv \ +--ckpt-path pretrained_models/PixArt-Sigma-XL-2-2K-MS.pth +``` + +**Breaking Down the Command** +- **Environment Variables**: + - `NCCL_P2P_LEVEL=NVL`: Enables peer-to-peer communication over NVLink when available, which can improve communication performance between GPUs on the same node. + - `NCCL_NET_GDR_LEVEL=PIX`: Enables GPU Direct RDMA (GDR) over PCIe, enhancing inter-node communication performance, especially when using InfiniBand networks. + - `NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8`: Specifies the list of InfiniBand Host Channel Adapters (HCAs) to be used by NCCL for communication. Adjust this according to your cluster's hardware configuration. + - `NCCL_IB_PCI_RELAXED_ORDERING=1`: Enables relaxed ordering on PCIe for InfiniBand devices, which can improve performance in some scenarios. + - `NCCL_SOCKET_IFNAME=eno1`: Specifies the network interface to be used for socket-based communication. Replace `eno1` with your network interface name. + - `NCCL_DEBUG=WARN`: Sets the level of NCCL debug output. `WARN` will report warnings and errors. + - `TORCH_NCCL_ASYNC_ERROR_HANDLING=1`: Enables asynchronous error handling in NCCL when using PyTorch. This helps catch errors in NCCL operations without causing the program to hang. + - `TORCH_NCCL_ENABLE_MONITORING=1`: Enables NCCL monitoring in PyTorch for better debugging and error reporting. 
+ - `TOKENIZERS_PARALLELISM=false`: Disables parallelism in tokenizers to prevent potential deadlocks. + - `OMP_NUM_THREADS=16`: Sets the number of OpenMP threads to use. This can improve performance by limiting the number of threads per process. You can calculate this automatically based on your system's available CPU cores. For example: + ``` + OMP_NUM_THREADS=$(( $(nproc --all) / 8 )) + ``` + This divides the total number of CPU cores by the number of GPUs per node (in this case, 8). +- **Colossal-AI Command**: + - `colossalai run`: Command to run the training script with [Colossal-AI's distributed training](https://colossalai.org/docs/concepts/colossalai_overview), essentially a wrapper around `torch.distributed` with improved parallelization and memory management. + - `--nproc_per_node 8`: Specifies the number of processes (GPUs) to run per node. Adjust this according to your cluster's configuration. + - `--hostfile <hostfile>`: Specifies the file containing the list of hostnames or IP addresses of the nodes in the cluster. In the [setup section](../setup) we have named the hostfile `nodes.txt`{: .filepath}. +- **Training Script Arguments**: + - `scripts/train.py`: The training script to run. + - `<config_file>`: Path to the configuration file for the current training stage. + - `--data-path <data_path>`: Path to the dataset CSV file. + - `--ckpt-path <ckpt_path>`: Path to the checkpoint file from which to load model weights. +- **Checkpoint Loading Comes in Three Variants**: + - `--ckpt-path`: This loads only the model weights, so effectively it resets the data loader and optimizer state. Use this when you want to start a new training stage or if you've applied changes to the dataset or model. + - `--load`: This loads all the data loader state, the learning rate scheduler state, the model weights and the optimizer state. 
Use this to resume training, for instance after a crash or a manual stop. + - `--load` with `--start-from-scratch`: This re-initializes the data loader while resuming the model, optimizer and scheduler states. Use this if you want to keep the optimizer state but have applied changes to the dataset or number of nodes. + + +### Managing Jobs on Bare Metal +When training on a bare-metal cluster, it's essential to manage jobs effectively. + +- **Check Running GPU Processes Before Starting Training**: + The repository contains a small tool, `nvtop_all.py`, that runs `nvidia-smi` on all nodes listed in the hostfile. + ```bash + python nvtop_all.py <HOSTFILE> + ``` + This helps you ensure that no previous jobs are still running. + + This is what its output looks like. + (It seems an old job is still running on our cluster.) + ```bash + > python nvtop_all.py nodes.txt + Node GPU Processes Mean Power Consumption (W) + 0 ml-64-node-008 8 697.76625 + 1 ml-64-node-003 8 698.57875 + 2 ml-64-node-005 8 697.44250 + 3 ml-64-node-004 8 696.90750 + 4 ml-64-node-006 8 696.62000 + 5 ml-64-node-007 8 695.61625 + 6 ml-64-node-002 8 696.28000 + 7 ml-64-node-001 8 695.35500 + ... + ``` +- **Stop All Training Processes on the Cluster**: + To stop all training processes matching the regex `python.*train\.py`, you can use the `kill_process.sh` script: + ```bash + ./kill_process.sh <HOSTFILE> + ``` + + Its output looks as follows: + ```bash + > ./kill_process.sh nodes.txt + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-001 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-002 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-003 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-004 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-005 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-006 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-007 + Sending 'sudo pkill -f python.*train\.py' to ml-64-node-008 + ...
+ ``` + + + +### Launching the Inference Server +To effectively monitor a diffusion model during training, it is essential to inspect the model quality regularly. We recommend assessing the model's performance on a separate machine. While typical training scripts perform evaluation concurrently with training, such as every few hundred steps, it is more practical to delegate this task to a dedicated process for Text-to-Video (T2V) models. This approach ensures that the relatively slow evaluation process does not hinder the training progress of the whole cluster. +Read the details below on how to start the inference server and log to the same Weights & Biases (W&B) run. + + + + +## 18k Hour Training +With our initial budget, we aim to replicate the core aspects of Open-Sora's training recipe at about half the training time of the original model. +Let's start training with the first part, the **18k GPU hour run**, with only minor adjustments: +**Key Changes to the Original Three Stages** +- **Dataset**: We start training with **OpenVid1M**. +- **Weight Decay**: Adapted weight decay to `0.01`. +- **Warmup Steps**: Shortened learning rate warmup to 400 steps. +- **Epochs**: We train for 5 epochs per stage (adjust the config if needed). + + +### Stage 1 +- **Config**: [`lambda/stage1.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage1.py) +- **Details**: 5 epochs, mainly on lower resolutions. + - We load PixArt Sigma weights here. Note that we didn't apply model conversion training (Stage 0), so only the spatial parts of the model are pre-trained and the temporal branches are initialized randomly. However, since the overall structure of the model is similar, we still use the pre-trained weights from the T2I model. + - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/gt4gjgwm).
+- **Training Command**: + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage1.py \ + --data-path OpenVid1M.csv \ + --ckpt-path pretrained_models/PixArt-Sigma-XL-2-2K-MS.pth + ``` + + +**Results** + + + +### Stage 2 +- **Config**: [`lambda/stage2.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage2.py) +- **Details**: 5 epochs, mainly on mid resolutions. + - We load the checkpoint from Stage 1 using `--ckpt-path`. + - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/7xw6fx7o). +- **Training Command**: + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage2.py \ + --data-path OpenVid1M.csv \ + --ckpt-path ./outputs_speedrun/008-STDiT3-XL-2/epoch4-global_step2210/ + ``` + +**Results** + + + +### Stage 3 +- **Config**: [`lambda/stage3.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage3.py) +- **Details**: 5 epochs, mainly on higher resolutions. + - Adds the **MiraData-330k** dataset to provide a larger selection of high-resolution video clips. + - We load the checkpoint from Stage 2 using `--ckpt-path`. + - Increased the learning rate slightly, to `2e-4`. 
+ - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/mxd1zk0o). +- **Training Command**: + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage3.py \ + --data-path OpenVid1M-MiraData330k.csv \ + --ckpt-path ./outputs_speedrun/009-STDiT3-XL-2/epoch4-global_step7099 + ``` + +**Results** + + + +## Additional 7k GPU Hours +To further enhance quality, we invested an additional **7k GPU hours** and defined three more training stages (Stages 4, 5, and 6). These stages are mostly copies of the original three used above for the 18k GPU hours run. + +**Key Differences in Additional Stages** +In this phase, we decided to experiment with several key modifications to explore their impact on our results. We used both datasets, OpenVid and MiraData, simultaneously, totaling 1.3 million video clips, to provide the model with more diverse data. We adopted a different learning rate schedule with a small "warmup bump" to encourage more significant changes at the start of training. Additionally, we removed masking, as it's been shown to negatively affect output quality according to [this paper](https://arxiv.org/abs/2407.15811). By eliminating masking, we wanted to see if the trade-off between faster training times and potential improvements in output quality would benefit our application. + + +### Stage 4 +- **Config**: [`lambda/stage4.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage4.py) +- **Details**: 3 epochs, mainly on lower resolutions. + - We load the checkpoint from Stage 3 using `--ckpt-path`. 
+ - Warm-Up using a short 1-cycle warmup "bump". + - Disabled masking + - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/92dwzcpr). +- **Training Command for Stage 4** + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage4.py \ + --data-path OpenVid1M-MiraData330k.csv \ + --ckpt-path ./outputs_speedrun/010-STDiT3-XL-2/epoch4-global_step14778 + ``` + +**Results** + + + +### Stage 5 +- **Config**: [`lambda/stage5.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage5.py) +- **Details**: 1 epoch, on mid resolutions. + - We load the checkpoint from Stage 4 using `--ckpt-path`. + - Warm-Up using a short 1-cycle warmup "bump". + - Disabled masking + - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/8flh231q). 
+- **Training Command for Stage 5** + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage5.py \ + --data-path OpenVid1M-MiraData330k.csv \ + --ckpt-path ./outputs_speedrun/012-STDiT3-XL-2/epoch3-global_step2100 + ``` + +**Results** + + + + +### Stage 6 + +- **Config**: [`lambda/stage6.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage6.py) +- **Details**: 1 epoch, on high resolutions. + - We load the checkpoint from Stage 5 using `--ckpt-path`. + - Warm-Up using a short 1-cycle warmup "bump". + - Disabled masking + - **Note**: Stage 6 crashed. For completeness, we added Stage 7 to finish training. + - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/w388gmol). +- **Training Command for Stage 6** + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage6.py \ + --data-path OpenVid1M-MiraData330k.csv \ + --ckpt-path ./outputs_speedrun/013-STDiT3-XL-2/epoch1-global_step3300 + ``` + +**Results** + + + + +### Stage 7 +- **Config**: [`lambda/stage7.py`](https://github.com/LambdaLabsML/Open-Sora/blob/main/configs/opensora-v1-2/lambda/stage7.py) +- **Details**: 3 epochs, on high resolutions. + - Same config as Stage 6.
+ > Note, we could have used `--load` here as well to "continue" training of Stage 6 without having to warm up again. + {: .prompt-tip} + - To view the details of the W&B run, click [here](https://wandb.ai/lambdalabs/sora_speedrun/runs/5fgm44u5). +- **Training Command for Stage 7** + ```bash + NCCL_P2P_LEVEL=NVL NCCL_NET_GDR_LEVEL=PIX NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8 NCCL_IB_PCI_RELAXED_ORDERING=1 NCCL_SOCKET_IFNAME=eno1 NCCL_DEBUG=WARN TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_NCCL_ENABLE_MONITORING=1 TOKENIZERS_PARALLELISM=false OMP_NUM_THREADS=16 colossalai run --nproc_per_node 8 \ + --hostfile nodes.txt \ + scripts/train.py configs/opensora-v1-2/lambda/stage7.py \ + --data-path OpenVid1M-MiraData330k.csv \ + --ckpt-path ./outputs_speedrun/015-STDiT3-XL-2/epoch0-global_step2900 + ``` + +**Results** + + + + + + + + +
+ +--- + +**What Next?**: +We hope this tutorial has provided you with valuable insights into training large-scale Text-to-Video models using OpenSora 1.2. By sharing our experiences — from configuring the training environment to troubleshooting complex issues — we aim to equip you with the knowledge to navigate similar challenges in your own projects. + +Feel free to experiment with the configurations and techniques we've discussed. + +*Cheers* + + +--- + + + + \ No newline at end of file diff --git a/docs/tutorial/assets/css/colors/typography-dark.scss b/docs/tutorial/assets/css/colors/typography-dark.scss new file mode 100644 index 00000000..0b4db688 --- /dev/null +++ b/docs/tutorial/assets/css/colors/typography-dark.scss @@ -0,0 +1,150 @@ +/* + * The main dark mode styles + */ + +@mixin dark-scheme { + /* Framework color */ + --main-bg: #1b1b1e; /* Converted rgb(27, 27, 30) to hex */ + --mask-bg: #444546; /* Converted rgb(68,69,70) to hex */ + --main-border-color: #2c2d2d; /* Converted rgb(44,45,45) to hex */ + + /* Common color */ + --text-color: white; /* charcoal */ + --text-muted-color: gray; /* retained from original */ + --text-muted-highlight-color: #aeaeae; /* retained from original */ + --heading-color: #fafafa; /* charcoal */ + --label-color: #fafafa; /* charcoal */ + --blockquote-border-color: #424242; /* Converted rgb(66,66,66) to hex */ + --blockquote-text-color: #868686; /* retained from original */ + --link-color: #7030c4; /* purple (primary) */ + --link-underline-color: #7030c4; /* purple (primary) */ + --button-bg: #7030c4; /* purple (primary) */ + --btn-border-color: #7030c4; /* purple (primary) */ + --btn-backtotop-color: white; /* charcoal */ + --btn-backtotop-border-color: #212122; /* retained from original */ + --btn-box-shadow: #1b1b1e; /* --main-bg */ + --card-header-bg: #292929; /* retained from original */ + --checkbox-color: #d200b0; /* magenta (secondary) */ + --checkbox-checked-color: #7030c4; /* purple (primary) */ + --img-bg: 
radial-gradient(circle, #161618 0%, #202020 100%); + --shimmer-bg: linear-gradient( + 90deg, + rgba(255, 255, 255, 0) 0%, + rgba(58, 55, 55, 0.4) 50%, /* rgba() takes decimal channels: 0x3a,0x37,0x37 = 58,55,55 */ + rgba(255, 255, 255, 0) 100% + ); + + /* Sidebar */ + --site-title-color: #717070; /* Converted rgb(113,113,113) to hex */ + --site-subtitle-color: #868686; /* retained from original */ + --sidebar-bg: #1e1e1e; /* retained from original */ + --sidebar-border-color: #292929; /* retained from original */ + --sidebar-muted-color: #868686; /* retained from original */ + --sidebar-active-color: #7030c4; /* purple (primary) */ + --sidebar-hover-bg: #262626; /* retained from original */ + --sidebar-btn-bg: #232328; /* retained from original */ + --sidebar-btn-color: #787878; /* retained from original */ + --avatar-border-color: rgba(206, 206, 206, 0.9); /* kept as rgba so the 0.9 opacity survives */ + + /* Topbar */ + --topbar-bg: rgba(27, 27, 30, 0.64); /* Retained as rgba */ + --topbar-text-color: #262626; /* charcoal */ + --search-border-color: #373737; /* Converted rgb(55,55,55) to hex */ + --search-icon-color: #646669; /* Converted rgb(100,102,105) to hex */ + --input-focus-border-color: #707273; /* Converted rgb(112,114,115) to hex */ + + /* Home page */ + --post-list-text-color: #262626; /* charcoal */ + --btn-patinator-text-color: #262626; /* charcoal */ + --btn-paginator-hover-color: #2e2e2e; /* retained from original */ + + /* Posts */ + --toc-highlight: #7030c4; /* changed from rgb(116,178,243) to purple */ + --tag-hover: #d200b0; /* magenta (secondary) */ + --tb-odd-bg: #252526; /* retained from original */ + --tb-even-bg: #1f1f22; /* Converted rgb(31,31,34) to hex */ + --tb-border-color: #252526; /* --tb-odd-bg */ + --footnote-target-bg: #3f51b5; /* Converted rgb(63,81,181) to hex */ + --btn-share-color: #6c757d; /* retained from original */ + --btn-share-hover-color: #bfc1ca; /* retained from original */ + --card-bg: #1e1e1e; /* retained from original */ +
--card-hovor-bg: #464d51; /* retained from original */ + --card-shadow: rgba(21, 21, 21, 0.72) 0 6px 18px 0, + rgba(137, 135, 135, 0.24) 0 0 0 1px; /* decimal channels of rgb(21,21,21,0.72) and rgb(137,135,135,0.24); hex digits must not be used inside rgba() */ + --kbd-wrap-color: #6a6a6a; /* retained from original */ + --kbd-text-color: #262626; /* charcoal */ + --kbd-bg-color: #242424; /* retained from original */ + --prompt-text-color: rgba(216, 212, 212, 0.75); /* retained as rgba */ + --prompt-tip-bg: rgba(112, 48, 196, 0.1); /* retained as rgba */ + --prompt-tip-icon-color: rgba(112, 48, 196, 1); /* retained as rgba */ + --prompt-info-bg: #073b68; /* Converted rgb(7,59,104,0.8) to hex */ + --prompt-info-icon-color: #0070c4; /* Changed to primary purple */ + --prompt-warning-bg: #5a4503; /* Converted rgb(90,69,3,0.88) to hex */ + --prompt-warning-icon-color: #ffa500; /* Converted rgb(255,165,0,0.8) to hex */ + --prompt-danger-bg: #5a1c08; /* Converted rgb(86,28,8,0.8) to hex */ + --prompt-danger-icon-color: #d200b0; /* Changed to secondary magenta */ + + /* Tags */ + --tag-border: #3b4f58; /* Converted rgb(59,79,88) to hex */ + --tag-shadow: #202121; /* Converted rgb(32,33,33) to hex */ + --dash-color: #3f4144; /* Converted rgb(63,65,68) to hex */ + --search-tag-bg: #292828; /* retained from original */ + + /* Categories */ + --categories-border: rgba(64, 66, 69, 0.5); /* decimal channels of rgb(64,66,69,0.5), kept as rgba for the opacity */ + --categories-hover-bg: #494b4c; /* Converted rgb(73,75,76) to hex */ + --categories-icon-hover-color: #7030c4; /* purple (primary) */ + + /* Archive */ + --timeline-node-bg: #96989c; /* Converted rgb(150,152,156) to hex */ + --timeline-color: #3f4144; /* Converted rgb(63,65,68) to hex */ + --timeline-year-dot-color: #3f4144; /* --timeline-color */ + + /* Additional Prompt Classes */ + [class^='prompt-'] { + --link-underline-color: #dbd8d8; /* Converted rgb(219,216,216) to hex */ + } + + .light { + display: none; + } + + /* Categories */ + .categories.card, + .list-group-item
{ + background-color: #1e1e1e; /* --card-bg */ + } + + .categories { + .card-header { + background-color: #292929; /* --card-header-bg */ + } + + .list-group-item { + border-left: none; + border-right: none; + padding-left: 2rem; + border-color: #3b4f58; /* --categories-border */ + + &:last-child { + border-bottom-color: #1e1e1e; /* --card-bg */ + } + } + } + + #archives li:nth-child(odd) { + background-image: linear-gradient( + to left, + #1a1a1e, /* Converted rgb(26,26,30) to hex */ + #27272d, /* Converted rgb(39,39,45) to hex */ + #27272d, + #27272d, + #1a1a1e + ); + } + + /* stylelint-disable-next-line selector-id-pattern */ + #disqus_thread { + color-scheme: none; + } +} /* dark-scheme */ diff --git a/docs/tutorial/assets/css/colors/typography-light.scss b/docs/tutorial/assets/css/colors/typography-light.scss new file mode 100644 index 00000000..debc2514 --- /dev/null +++ b/docs/tutorial/assets/css/colors/typography-light.scss @@ -0,0 +1,112 @@ +/* + * The syntax light mode typography colors + */ + + @mixin light-scheme { + /* Framework color */ + --main-bg: #ffffff; /* white */ + --mask-bg: #c1c3c5; + --main-border-color: #f3f3f3; + + /* Common color */ + --text-color: #262626; /* charcoal */ + --text-muted-color: #757575; /* retained from original */ + --text-muted-highlight-color: inherit; + --heading-color: #262626; /* charcoal */ + --label-color: #262626; /* charcoal */ + --blockquote-border-color: #eeeeee; + --blockquote-text-color: #757575; /* retained from original */ + --link-color: #7030c4; /* purple (primary) */ + --link-underline-color: #7030c4; /* purple (primary) */ + --button-bg: #7030c4; /* purple (primary) */ + --btn-border-color: #7030c4; /* purple (primary) */ + --btn-backtotop-color: white; /* charcoal */ + --btn-backtotop-border-color: #7030c4; /* purple (primary) */ + --btn-box-shadow: #eaeaea; /* retained from original */ + --checkbox-color: #d200b0; /* magenta (secondary) */ + --checkbox-checked-color: #d200b0; /* magenta (secondary) */ 
+ --img-bg: radial-gradient( + circle, + #ffffff 0%, /* white */ + #efefef 100% /* light gray */ + ); + --shimmer-bg: linear-gradient( + 90deg, + rgba(250, 250, 250, 0) 0%, + rgba(232, 230, 230, 1) 50%, + rgba(250, 250, 250, 0) 100% + ); + + /* Sidebar */ + --site-title-color: #707070; /* Converted rgb(113,113,113) to hex */ + --site-subtitle-color: #717171; + --sidebar-bg: #f6f8fa; + --sidebar-border-color: #efefef; + --sidebar-muted-color: #545454; + --sidebar-active-color: #262626; /* charcoal */ + --sidebar-hover-bg: rgba(223, 233, 241, 0.64); /* Retained as rgba */ + --sidebar-btn-bg: #ffffff; /* white */ + --sidebar-btn-color: #8e8e8e; + --avatar-border-color: #ffffff; /* white */ + + /* Topbar */ + --topbar-bg: rgba(255, 255, 255, 0.7); /* Retained as rgba */ + --topbar-text-color: #4e4e4e; /* Converted rgb(78,78,78) to hex */ + --search-border-color: #f0f0f0; /* Converted rgb(240,240,240) to hex */ + --search-icon-color: #c2c6cc; + --input-focus-border-color: #b8b8b8; + + /* Home page */ + --post-list-text-color: #696969; /* dimgray to hex */ + --btn-patinator-text-color: #555555; + --btn-paginator-hover-color: #f6f8fa; /* --sidebar-bg */ + + /* Posts */ + --toc-highlight: #7030c4; /* retained as is (blue-700) */ + --btn-share-color: #808080; /* gray to hex */ + --btn-share-hover-color: #0d6efd; /* retained as is (blue) */ + --card-bg: #ffffff; /* white */ + --card-hovor-bg: #e2e2e2; + --card-shadow: rgba(104, 104, 104, 0.05) 0 2px 6px 0, + rgba(211, 209, 209, 0.15) 0 0 0 1px; + --footnote-target-bg: #e0ffff; /* lightcyan to hex */ + --tb-odd-bg: #fbfcfd; + --tb-border-color: #eaeaea; + --dash-color: #c0c0c0; /* silver to hex */ + --kbd-wrap-color: #bdbdbd; + --kbd-text-color: #262626; /* charcoal */ + --kbd-bg-color: #ffffff; /* white */ + --prompt-text-color: rgba(46, 46, 46, 0.77); /* retained as rgba */ + --prompt-tip-bg: rgba(112, 48, 196, 0.1); /* retained as rgba */ + --prompt-tip-icon-color: rgba(112, 48, 196, 1); /* retained as rgba */ + 
--prompt-info-bg: #e1f5fe; + --prompt-info-icon-color: #0070cb; + --prompt-warning-bg: #fff3cd; /* converted rgb(255,243,205) to hex */ + --prompt-warning-icon-color: #ef9c03; + --prompt-danger-bg: #f8d7da; /* converted rgb(248,215,218) to hex */ + --prompt-danger-icon-color: #df3c30; + + /* Tags */ + --tag-border: #dee2e6; + --tag-shadow: #e9ecef; /* --btn-border-color */ + --tag-hover: #dee2e6; /* converted rgb(222,226,230) to hex */ + --search-tag-bg: #f8f9fa; + + /* Categories */ + --categories-border: rgba(0, 0, 0, 0.125); /* retained as rgba */ + --categories-hover-bg: #e9ecef; /* --btn-border-color */ + --categories-icon-hover-color: #2f4f4f; /* darkslategray to hex */ + + /* Archive */ + --timeline-color: rgba(0, 0, 0, 0.075); /* retained as rgba */ + --timeline-node-bg: #c2c6cc; + --timeline-year-dot-color: #ffffff; /* white */ + + [class^='prompt-'] { + --link-underline-color: #dbd8d8; /* Converted rgb(219,216,216) to hex */ + } + + .dark { + display: none; + } +} /* light-scheme */ diff --git a/docs/tutorial/assets/css/jekyll-theme-chirpy.scss b/docs/tutorial/assets/css/jekyll-theme-chirpy.scss new file mode 100644 index 00000000..ed1ca336 --- /dev/null +++ b/docs/tutorial/assets/css/jekyll-theme-chirpy.scss @@ -0,0 +1,40 @@ +--- +--- + +@import 'main +{%- if jekyll.environment == 'production' -%} + .bundle +{%- endif -%} +'; + +@import 'colors/typography-dark.scss'; +@import 'colors/typography-light.scss'; + +/* append your custom style below */ +.todo { background: red ;}; + +html { + font-size: 16px; + + @media (prefers-color-scheme: light) { + &:not([data-mode]), + &[data-mode='light'] { + @include light-scheme; + } + + &[data-mode='dark'] { + @include dark-scheme; + } + } + + @media (prefers-color-scheme: dark) { + &:not([data-mode]), + &[data-mode='dark'] { + @include dark-scheme; + } + + &[data-mode='light'] { + @include light-scheme; + } + } + } \ No newline at end of file diff --git a/docs/tutorial/assets/fails_loss.png 
b/docs/tutorial/assets/fails_loss.png new file mode 100644 index 00000000..d4bfff5b Binary files /dev/null and b/docs/tutorial/assets/fails_loss.png differ diff --git a/docs/tutorial/assets/fails_weight_norm.png b/docs/tutorial/assets/fails_weight_norm.png new file mode 100644 index 00000000..de14195c Binary files /dev/null and b/docs/tutorial/assets/fails_weight_norm.png differ diff --git a/docs/tutorial/assets/monitoring_tool.png b/docs/tutorial/assets/monitoring_tool.png new file mode 100644 index 00000000..2c8d86b5 Binary files /dev/null and b/docs/tutorial/assets/monitoring_tool.png differ diff --git a/docs/tutorial/assets/pyspy_dump.png b/docs/tutorial/assets/pyspy_dump.png new file mode 100644 index 00000000..14db6f67 Binary files /dev/null and b/docs/tutorial/assets/pyspy_dump.png differ diff --git a/docs/tutorial/index.md b/docs/tutorial/index.md new file mode 100644 index 00000000..e19eaf45 --- /dev/null +++ b/docs/tutorial/index.md @@ -0,0 +1,13 @@ +--- +layout: post +icon: fas fa-info-circle +permalink: '/' +title: "Tutorial - Let's reproduce a T2V model." +date: 2024-10-02 +toc: true +--- + + +