diff --git a/.circleci/config.yml b/.circleci/config.yml index 56202b7..acd6ffa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,5 +1,18 @@ version: 2.1 +executors: + windows-cpu: + machine: + resource_class: windows.xlarge + image: windows-server-2019-vs2019:stable + shell: bash.exe + + windows-gpu: + machine: + resource_class: windows.gpu.nvidia.medium + image: windows-server-2019-nvidia:stable + shell: bash.exe + commands: checkout_merge: @@ -29,48 +42,6 @@ commands: fi echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - install-conda: - steps: - - run: - name: Installing Conda - command: | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/conda - export PATH=$HOME/conda/bin:$PATH - echo 'export PATH=$HOME/conda/bin:$PATH' >> $BASH_ENV - conda install -y python=3 - - install-pytorch: - steps: - - run: - name: Installing PyTorch - command: | - conda install pytorch -yc pytorch-nightly - conda install -y numpy scipy - - install-cuda: - steps: - - run: - name: Installing CUDA - command: | - wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda-repo-ubuntu1604-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb - sudo dpkg -i cuda-repo-ubuntu1604-10-2-local-10.2.89-440.33.01_1.0-1_amd64.deb - sudo apt-key add /var/cuda-repo-10-2-local-10.2.89-440.33.01/7fa2af80.pub - sudo apt-get update - sudo apt-get -y install cuda - - build-csprng: - steps: - - run: - name: Building CSPRNG - command: python setup.py install - - run-tests: - steps: - - run: - name: Running tests - command: python test/test_csprng.py - binary_common: &binary_common parameters: # Edit these defaults to do a release` @@ -105,48 +76,6 @@ binary_common: &binary_common jobs: - build-nvcc: - machine: - image: ubuntu-1604:201903-01 - resource_class: gpu.small - steps: - - checkout - - install-conda - - install-pytorch - - install-cuda - - build-csprng - - run-tests - 
- build-cc: - machine: - image: ubuntu-1604:201903-01 - resource_class: large - steps: - - checkout - - install-conda - - install-pytorch - - build-csprng - - run-tests - - build-cc-pip: - machine: - image: ubuntu-1604:201903-01 - resource_class: large - steps: - - checkout - - run: - name: Setting Python 3 and upgrade pip - command: | - pyenv local 3.7.0 - pip install -U pip - - run: - name: Installing PyTorch - command: | - pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html - pip install numpy scipy - - build-csprng - - run-tests - binary_linux_wheel: <<: *binary_common docker: @@ -169,7 +98,9 @@ jobs: resource_class: 2xlarge+ steps: - checkout_merge - - run: packaging/build_conda.sh + - run: + no_output_timeout: 20m + command: packaging/build_conda.sh - store_artifacts: path: /opt/conda/conda-bld/linux-64 - persist_to_workspace: @@ -179,58 +110,63 @@ jobs: - store_test_results: path: build_results/ -# binary_win_conda: -# <<: *binary_common -# executor: windows-cpu -# steps: -# - checkout_merge -# - run: -# name: Build conda packages -# command: | -# set -ex -# source packaging/windows/internal/vc_install_helper.sh -# packaging/windows/internal/cuda_install.bat -# eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" -# conda activate base -# conda install -yq conda-build "conda-package-handling!=1.5.0" -# packaging/build_conda.sh -# rm /C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 -# - store_artifacts: -# path: C:/tools/miniconda3/conda-bld/win-64 -# - persist_to_workspace: -# root: C:/tools/miniconda3/conda-bld/win-64 -# paths: -# - "*" -# - store_test_results: -# path: build_results/ -# -# binary_win_wheel: -# <<: *binary_common -# executor: windows-cpu -# steps: -# - checkout_merge -# - run: -# name: Build wheel packages -# command: | -# set -ex -# source packaging/windows/internal/vc_install_helper.sh -# packaging/windows/internal/cuda_install.bat -# packaging/build_wheel.sh -# - 
store_artifacts: -# path: dist -# - persist_to_workspace: -# root: dist -# paths: -# - "*" -# - store_test_results: -# path: build_results/ + binary_win_conda: + <<: *binary_common + executor: windows-cpu + steps: + - checkout_merge + - run: + name: Build conda packages + no_output_timeout: 20m + command: | + set -ex + source packaging/windows/internal/vc_install_helper.sh + packaging/windows/internal/cuda_install.bat + eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" + conda activate base + conda install -yq conda-build "conda-package-handling!=1.5.0" + packaging/build_conda.sh + rm /C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 + - store_artifacts: + path: C:/tools/miniconda3/conda-bld/win-64 + - persist_to_workspace: + root: C:/tools/miniconda3/conda-bld/win-64 + paths: + - "*" + - store_test_results: + path: build_results/ + + binary_win_wheel: + <<: *binary_common + executor: windows-cpu + steps: + - checkout_merge + - run: + name: Build wheel packages + command: | + set -ex + source packaging/windows/internal/vc_install_helper.sh + packaging/windows/internal/cuda_install.bat + packaging/build_wheel.sh + - store_artifacts: + path: dist + - persist_to_workspace: + root: dist + paths: + - "*" + - store_test_results: + path: build_results/ binary_macos_wheel: <<: *binary_common macos: - xcode: "9.0" + xcode: "12.0" steps: - checkout_merge +# - run: +# name: Install libomp +# command: HOMEBREW_NO_AUTO_UPDATE=1 brew install libomp +# # Disable brew auto update which is very slow - run: # Cannot easily deduplicate this as source'ing activate # will set environment variables which we need to propagate @@ -250,9 +186,13 @@ jobs: binary_macos_conda: <<: *binary_common macos: - xcode: "9.0" + xcode: "12.0" steps: - checkout_merge +# - run: +# name: Install libomp +# command: HOMEBREW_NO_AUTO_UPDATE=1 brew install libomp +# # Disable brew auto update which is very slow - run: command: | curl -o conda.sh 
https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh @@ -308,6 +248,7 @@ jobs: for pkg in ~/workspace/*.whl; do aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read done + unittest_linux_cpu: <<: *binary_common docker: @@ -315,101 +256,201 @@ jobs: resource_class: 2xlarge+ steps: - checkout - - install-conda - - install-pytorch - - build-csprng - - run-tests + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should generate new cache. + command: echo "$(date +"%Y-%U")" > .circleci-weekly + - restore_cache: + + keys: + - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + - run: + name: Setup + command: .circleci/unittest/linux/scripts/setup_env.sh + - save_cache: + + key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + paths: + - conda + - env + - run: + name: Install torchcsprng + command: .circleci/unittest/linux/scripts/install.sh + - run: + name: Run tests + command: .circleci/unittest/linux/scripts/run_test.sh + - run: + name: Post process + command: .circleci/unittest/linux/scripts/post_process.sh + - store_test_results: + path: test-results unittest_linux_gpu: <<: *binary_common machine: image: ubuntu-1604-cuda-10.1:201909-23 - resource_class: gpu.small + resource_class: gpu.nvidia.small.multi environment: image_name: "pytorch/manylinux-cuda101" + PYTHON_VERSION: << parameters.python_version >> steps: - checkout - - install-conda - - install-pytorch - - install-cuda - - build-csprng - - run-tests + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should generate new cache. 
+ command: echo "$(date +"%Y-%U")" > .circleci-weekly + - restore_cache: -# unittest_windows_cpu: -# <<: *binary_common -# executor: -# name: windows-cpu -# steps: -# - checkout -# - run: -# name: Generate cache key -# # This will refresh cache on Sundays, nightly build should generate new cache. -# command: echo "$(date +"%Y-%U")" > .circleci-weekly -# - restore_cache: -# -# keys: -# - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} -# -# - run: -# name: Setup -# command: .circleci/unittest/windows/scripts/setup_env.sh -# - save_cache: -# -# key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} -# -# paths: -# - conda -# - env -# - run: -# name: Install torchvision -# command: .circleci/unittest/windows/scripts/install.sh -# - run: -# name: Run tests -# command: .circleci/unittest/windows/scripts/run_test.sh -# - run: -# name: Post process -# command: .circleci/unittest/windows/scripts/post_process.sh -# - store_test_results: -# path: test-results -# -# unittest_windows_gpu: -# <<: *binary_common -# executor: -# name: windows-gpu -# environment: -# CUDA_VERSION: "10.1" -# steps: -# - checkout -# - run: -# name: Generate cache key -# # This will refresh cache on Sundays, nightly build should generate new cache. 
-# command: echo "$(date +"%Y-%U")" > .circleci-weekly -# - restore_cache: -# -# keys: -# - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} -# -# - run: -# name: Setup -# command: .circleci/unittest/windows/scripts/setup_env.sh -# - save_cache: -# -# key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} -# -# paths: -# - conda -# - env -# - run: -# name: Install torchvision -# command: .circleci/unittest/windows/scripts/install.sh -# - run: -# name: Run tests -# command: .circleci/unittest/windows/scripts/run_test.sh + keys: + - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + - run: + name: Setup + command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh + - save_cache: + + key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + paths: + - conda + - env + - run: + name: Install torchcsprng + command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/install.sh + - run: + name: Run tests + command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh + - run: + name: Post Process + command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh + - store_test_results: + path: test-results + + unittest_windows_cpu: + <<: *binary_common + executor: + name: windows-cpu + steps: + - checkout + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should generate 
new cache. + command: echo "$(date +"%Y-%U")" > .circleci-weekly + - restore_cache: + + keys: + - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + - run: + name: Setup + command: .circleci/unittest/windows/scripts/setup_env.sh + - save_cache: + + key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + paths: + - conda + - env + - run: + name: Install torchcsprng + command: .circleci/unittest/windows/scripts/install.sh + - run: + name: Run tests + command: .circleci/unittest/windows/scripts/run_test.sh + - run: + name: Post process + command: .circleci/unittest/windows/scripts/post_process.sh + - store_test_results: + path: test-results + + unittest_windows_gpu: + <<: *binary_common + executor: + name: windows-gpu + environment: + CUDA_VERSION: "10.1" + PYTHON_VERSION: << parameters.python_version >> + steps: + - checkout + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should generate new cache. 
+ command: echo "$(date +"%Y-%U")" > .circleci-weekly + - restore_cache: + + keys: + - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + - run: + name: Setup + command: .circleci/unittest/windows/scripts/setup_env.sh + - save_cache: + + key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + paths: + - conda + - env + - run: + name: Install torchcsprng + command: .circleci/unittest/windows/scripts/install.sh + - run: + name: Run tests + command: .circleci/unittest/windows/scripts/run_test.sh + - run: + name: Post process + command: .circleci/unittest/windows/scripts/post_process.sh + - store_test_results: + path: test-results + + unittest_macos_cpu: + <<: *binary_common + macos: + xcode: "12.0" + resource_class: large + steps: + - checkout + - designate_upload_channel + - run: + name: Install wget + command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget + # Disable brew auto update which is very slow # - run: -# name: Post process -# command: .circleci/unittest/windows/scripts/post_process.sh -# - store_test_results: -# path: test-results +# name: Install libomp +# command: HOMEBREW_NO_AUTO_UPDATE=1 brew install libomp +# # Disable brew auto update which is very slow + - run: + name: Generate cache key + # This will refresh cache on Sundays, nightly build should generate new cache. 
+ command: echo "$(date +"%Y-%U")" > .circleci-weekly + - restore_cache: + + keys: + - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + - run: + name: Setup + command: .circleci/unittest/linux/scripts/setup_env.sh + - save_cache: + + key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} + + paths: + - conda + - env + - run: + name: Install torchcsprng + command: .circleci/unittest/linux/scripts/install.sh + - run: + name: Run tests + command: .circleci/unittest/linux/scripts/run_test.sh + - run: + name: Post process + command: .circleci/unittest/linux/scripts/post_process.sh + - store_test_results: + path: test-results workflows: build: @@ -420,11 +461,6 @@ workflows: name: binary_linux_wheel_py3.6_cpu python_version: '3.6' wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_wheel: - cu_version: cu92 - name: binary_linux_wheel_py3.6_cu92 - python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_linux_wheel: cu_version: cu101 name: binary_linux_wheel_py3.6_cu101 @@ -435,16 +471,16 @@ workflows: name: binary_linux_wheel_py3.6_cu102 python_version: '3.6' wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_wheel: + cu_version: cu111 + name: binary_linux_wheel_py3.6_cu111 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda111 - binary_linux_wheel: cu_version: cpu name: binary_linux_wheel_py3.7_cpu python_version: '3.7' wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_wheel: - cu_version: cu92 - name: binary_linux_wheel_py3.7_cu92 - python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_linux_wheel: cu_version: cu101 name: binary_linux_wheel_py3.7_cu101 @@ -455,16 +491,16 @@ workflows: name: binary_linux_wheel_py3.7_cu102 python_version: '3.7' 
wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_wheel: + cu_version: cu111 + name: binary_linux_wheel_py3.7_cu111 + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda111 - binary_linux_wheel: cu_version: cpu name: binary_linux_wheel_py3.8_cpu python_version: '3.8' wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_wheel: - cu_version: cu92 - name: binary_linux_wheel_py3.8_cu92 - python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_linux_wheel: cu_version: cu101 name: binary_linux_wheel_py3.8_cu101 @@ -475,6 +511,31 @@ workflows: name: binary_linux_wheel_py3.8_cu102 python_version: '3.8' wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_wheel: + cu_version: cu111 + name: binary_linux_wheel_py3.8_cu111 + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_linux_wheel: + cu_version: cpu + name: binary_linux_wheel_py3.9_cpu + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_wheel: + cu_version: cu101 + name: binary_linux_wheel_py3.9_cu101 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_linux_wheel: + cu_version: cu102 + name: binary_linux_wheel_py3.9_cu102 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_wheel: + cu_version: cu111 + name: binary_linux_wheel_py3.9_cu111 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda111 - binary_macos_wheel: cu_version: cpu name: binary_macos_wheel_py3.6_cpu @@ -490,134 +551,160 @@ workflows: name: binary_macos_wheel_py3.8_cpu python_version: '3.8' wheel_docker_image: pytorch/manylinux-cuda102 -# - binary_win_wheel: -# cu_version: cpu -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.6_cpu -# python_version: '3.6' -# - binary_win_wheel: -# cu_version: cu92 -# filters: -# branches: -# only: master -# tags: -# only: 
/v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.6_cu92 -# python_version: '3.6' -# - binary_win_wheel: -# cu_version: cu101 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.6_cu101 -# python_version: '3.6' -# - binary_win_wheel: -# cu_version: cu102 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.6_cu102 -# python_version: '3.6' -# - binary_win_wheel: -# cu_version: cpu -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.7_cpu -# python_version: '3.7' -# - binary_win_wheel: -# cu_version: cu92 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.7_cu92 -# python_version: '3.7' -# - binary_win_wheel: -# cu_version: cu101 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.7_cu101 -# python_version: '3.7' -# - binary_win_wheel: -# cu_version: cu102 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.7_cu102 -# python_version: '3.7' -# - binary_win_wheel: -# cu_version: cpu -# name: binary_win_wheel_py3.8_cpu -# python_version: '3.8' -# - binary_win_wheel: -# cu_version: cu92 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.8_cu92 -# python_version: '3.8' -# - binary_win_wheel: -# cu_version: cu101 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_wheel_py3.8_cu101 -# python_version: '3.8' -# - binary_win_wheel: -# cu_version: cu102 -# name: binary_win_wheel_py3.8_cu102 -# python_version: '3.8' - - binary_linux_conda: + - binary_macos_wheel: cu_version: cpu - name: binary_linux_conda_py3.6_cpu - python_version: '3.6' + name: 
binary_macos_wheel_py3.9_cpu + python_version: '3.9' wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_conda: - cu_version: cu92 - name: binary_linux_conda_py3.6_cu92 + - binary_win_wheel: + cu_version: cpu + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.6_cpu python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda92 - - binary_linux_conda: + - binary_win_wheel: cu_version: cu101 - name: binary_linux_conda_py3.6_cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.6_cu101 python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda101 - - binary_linux_conda: + - binary_win_wheel: cu_version: cu102 - name: binary_linux_conda_py3.6_cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.6_cu102 python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_conda: + - binary_win_wheel: + cu_version: cu111 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.6_cu111 + python_version: '3.6' + - binary_win_wheel: cu_version: cpu - name: binary_linux_conda_py3.7_cpu - python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_conda: - cu_version: cu92 - name: binary_linux_conda_py3.7_cu92 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.7_cpu python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda92 + - binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.7_cu101 + python_version: '3.7' + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.7_cu102 + python_version: '3.7' + 
- binary_win_wheel: + cu_version: cu111 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.7_cu111 + python_version: '3.7' + - binary_win_wheel: + cu_version: cpu + name: binary_win_wheel_py3.8_cpu + python_version: '3.8' + - binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.8_cu101 + python_version: '3.8' + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.8_cu102 + python_version: '3.8' + - binary_win_wheel: + cu_version: cu111 + name: binary_win_wheel_py3.8_cu111 + python_version: '3.8' + - binary_win_wheel: + cu_version: cpu + name: binary_win_wheel_py3.9_cpu + python_version: '3.9' + - binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.9_cu101 + python_version: '3.9' + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_wheel_py3.9_cu102 + python_version: '3.9' + - binary_win_wheel: + cu_version: cu111 + name: binary_win_wheel_py3.9_cu111 + python_version: '3.9' + - binary_linux_conda: + cu_version: cpu + name: binary_linux_conda_py3.6_cpu + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_conda: + cu_version: cu101 + name: binary_linux_conda_py3.6_cu101 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_linux_conda: + cu_version: cu102 + name: binary_linux_conda_py3.6_cu102 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_conda: + cu_version: cu111 + name: binary_linux_conda_py3.6_cu111 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_linux_conda: + cu_version: cpu + 
name: binary_linux_conda_py3.7_cpu + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda102 - binary_linux_conda: cu_version: cu101 name: binary_linux_conda_py3.7_cu101 @@ -628,16 +715,16 @@ workflows: name: binary_linux_conda_py3.7_cu102 python_version: '3.7' wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_conda: + cu_version: cu111 + name: binary_linux_conda_py3.7_cu111 + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda111 - binary_linux_conda: cu_version: cpu name: binary_linux_conda_py3.8_cpu python_version: '3.8' wheel_docker_image: pytorch/manylinux-cuda102 - - binary_linux_conda: - cu_version: cu92 - name: binary_linux_conda_py3.8_cu92 - python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_linux_conda: cu_version: cu101 name: binary_linux_conda_py3.8_cu101 @@ -648,6 +735,31 @@ workflows: name: binary_linux_conda_py3.8_cu102 python_version: '3.8' wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_conda: + cu_version: cu111 + name: binary_linux_conda_py3.8_cu111 + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_linux_conda: + cu_version: cpu + name: binary_linux_conda_py3.9_cpu + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_conda: + cu_version: cu101 + name: binary_linux_conda_py3.9_cu101 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_linux_conda: + cu_version: cu102 + name: binary_linux_conda_py3.9_cu102 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_linux_conda: + cu_version: cu111 + name: binary_linux_conda_py3.9_cu111 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda111 - binary_macos_conda: cu_version: cpu name: binary_macos_conda_py3.6_cpu @@ -663,104 +775,135 @@ workflows: name: binary_macos_conda_py3.8_cpu python_version: '3.8' wheel_docker_image: pytorch/manylinux-cuda102 -# - binary_win_conda: -# 
cu_version: cpu -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.6_cpu -# python_version: '3.6' -# - binary_win_conda: -# cu_version: cu92 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.6_cu92 -# python_version: '3.6' -# - binary_win_conda: -# cu_version: cu101 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.6_cu101 -# python_version: '3.6' -# - binary_win_conda: -# cu_version: cu102 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.6_cu102 -# python_version: '3.6' -# - binary_win_conda: -# cu_version: cpu -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.7_cpu -# python_version: '3.7' -# - binary_win_conda: -# cu_version: cu92 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.7_cu92 -# python_version: '3.7' -# - binary_win_conda: -# cu_version: cu101 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.7_cu101 -# python_version: '3.7' -# - binary_win_conda: -# cu_version: cu102 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.7_cu102 -# python_version: '3.7' -# - binary_win_conda: +# - binary_macos_conda: # cu_version: cpu -# name: binary_win_conda_py3.8_cpu -# python_version: '3.8' -# - binary_win_conda: -# cu_version: cu92 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: binary_win_conda_py3.8_cu92 -# python_version: '3.8' -# - binary_win_conda: -# cu_version: cu101 -# filters: -# branches: -# only: master -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: 
binary_win_conda_py3.8_cu101 -# python_version: '3.8' -# - binary_win_conda: -# cu_version: cu102 -# name: binary_win_conda_py3.8_cu102 -# python_version: '3.8' +# name: binary_macos_conda_py3.9_cpu +# python_version: '3.9' +# wheel_docker_image: pytorch/manylinux-cuda102 + - binary_win_conda: + cu_version: cpu + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.6_cpu + python_version: '3.6' + - binary_win_conda: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.6_cu101 + python_version: '3.6' + - binary_win_conda: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.6_cu102 + python_version: '3.6' + - binary_win_conda: + cu_version: cu111 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.6_cu111 + python_version: '3.6' + - binary_win_conda: + cu_version: cpu + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.7_cpu + python_version: '3.7' + - binary_win_conda: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.7_cu101 + python_version: '3.7' + - binary_win_conda: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.7_cu102 + python_version: '3.7' + - binary_win_conda: + cu_version: cu111 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.7_cu111 + python_version: '3.7' + - binary_win_conda: + cu_version: cpu + name: binary_win_conda_py3.8_cpu + python_version: '3.8' + - binary_win_conda: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: 
binary_win_conda_py3.8_cu101 + python_version: '3.8' + - binary_win_conda: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.8_cu102 + python_version: '3.8' + - binary_win_conda: + cu_version: cu111 + name: binary_win_conda_py3.8_cu111 + python_version: '3.8' + - binary_win_conda: + cu_version: cpu + name: binary_win_conda_py3.9_cpu + python_version: '3.9' + - binary_win_conda: + cu_version: cu101 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.9_cu101 + python_version: '3.9' + - binary_win_conda: + cu_version: cu102 + filters: + branches: + only: master + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: binary_win_conda_py3.9_cu102 + python_version: '3.9' + - binary_win_conda: + cu_version: cu111 + name: binary_win_conda_py3.9_cu111 + python_version: '3.9' # - python_lint # - python_type_check # - clang_format @@ -801,40 +944,68 @@ workflows: cu_version: cu101 name: unittest_linux_gpu_py3.8 python_version: '3.8' -# - unittest_windows_cpu: -# cu_version: cpu -# name: unittest_windows_cpu_py3.6 -# python_version: '3.6' -# - unittest_windows_cpu: -# cu_version: cpu -# name: unittest_windows_cpu_py3.7 -# python_version: '3.7' -# - unittest_windows_cpu: + - unittest_linux_gpu: + cu_version: cu101 + name: unittest_linux_gpu_py3.9 + python_version: '3.9' + - unittest_windows_cpu: + cu_version: cpu + name: unittest_windows_cpu_py3.6 + python_version: '3.6' + - unittest_windows_cpu: + cu_version: cpu + name: unittest_windows_cpu_py3.7 + python_version: '3.7' + - unittest_windows_cpu: + cu_version: cpu + name: unittest_windows_cpu_py3.8 + python_version: '3.8' + - unittest_windows_cpu: + cu_version: cpu + name: unittest_windows_cpu_py3.9 + python_version: '3.9' + - unittest_windows_gpu: + cu_version: cu101 + filters: + branches: + only: + - master + - nightly + name: unittest_windows_gpu_py3.6 + python_version: '3.6' + - 
unittest_windows_gpu: + cu_version: cu101 + filters: + branches: + only: + - master + - nightly + name: unittest_windows_gpu_py3.7 + python_version: '3.7' + - unittest_windows_gpu: + cu_version: cu101 + name: unittest_windows_gpu_py3.8 + python_version: '3.8' + - unittest_windows_gpu: + cu_version: cu101 + name: unittest_windows_gpu_py3.9 + python_version: '3.9' + - unittest_macos_cpu: + cu_version: cpu + name: unittest_macos_cpu_py3.6 + python_version: '3.6' + - unittest_macos_cpu: + cu_version: cpu + name: unittest_macos_cpu_py3.7 + python_version: '3.7' + - unittest_macos_cpu: + cu_version: cpu + name: unittest_macos_cpu_py3.8 + python_version: '3.8' +# - unittest_macos_cpu: # cu_version: cpu -# name: unittest_windows_cpu_py3.8 -# python_version: '3.8' -# - unittest_windows_gpu: -# cu_version: cu101 -# filters: -# branches: -# only: -# - master -# - nightly -# name: unittest_windows_gpu_py3.6 -# python_version: '3.6' -# - unittest_windows_gpu: -# cu_version: cu101 -# filters: -# branches: -# only: -# - master -# - nightly -# name: unittest_windows_gpu_py3.7 -# python_version: '3.7' -# - unittest_windows_gpu: -# cu_version: cu101 -# name: unittest_windows_gpu_py3.8 -# python_version: '3.8' +# name: unittest_macos_cpu_py3.9 +# python_version: '3.9' nightly: jobs: # - circleci_consistency @@ -862,27 +1033,6 @@ workflows: requires: - nightly_binary_linux_wheel_py3.6_cpu subfolder: cpu/ - - binary_linux_wheel: - cu_version: cu92 - filters: - branches: - only: nightly - tags: - only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.6_cu92 - python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda92 - - binary_wheel_upload: - context: org-member - filters: - branches: - only: nightly - tags: - only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.6_cu92_upload - requires: - - nightly_binary_linux_wheel_py3.6_cu92 - subfolder: cu92/ - binary_linux_wheel: cu_version: cu101 filters: @@ -926,15 +1076,15 @@ workflows: - 
nightly_binary_linux_wheel_py3.6_cu102 subfolder: cu102/ - binary_linux_wheel: - cu_version: cpu + cu_version: cu111 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.7_cpu - python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda102 + name: nightly_binary_linux_wheel_py3.6_cu111 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda111 - binary_wheel_upload: context: org-member filters: @@ -942,20 +1092,20 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.7_cpu_upload + name: nightly_binary_linux_wheel_py3.6_cu111_upload requires: - - nightly_binary_linux_wheel_py3.7_cpu - subfolder: cpu/ + - nightly_binary_linux_wheel_py3.6_cu111 + subfolder: cu111/ - binary_linux_wheel: - cu_version: cu92 + cu_version: cpu filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.7_cu92 + name: nightly_binary_linux_wheel_py3.7_cpu python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda92 + wheel_docker_image: pytorch/manylinux-cuda102 - binary_wheel_upload: context: org-member filters: @@ -963,10 +1113,10 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.7_cu92_upload + name: nightly_binary_linux_wheel_py3.7_cpu_upload requires: - - nightly_binary_linux_wheel_py3.7_cu92 - subfolder: cu92/ + - nightly_binary_linux_wheel_py3.7_cpu + subfolder: cpu/ - binary_linux_wheel: cu_version: cu101 filters: @@ -1010,15 +1160,15 @@ workflows: - nightly_binary_linux_wheel_py3.7_cu102 subfolder: cu102/ - binary_linux_wheel: - cu_version: cpu + cu_version: cu111 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.8_cpu - python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda102 + name: nightly_binary_linux_wheel_py3.7_cu111 + python_version: '3.7' + 
wheel_docker_image: pytorch/manylinux-cuda111 - binary_wheel_upload: context: org-member filters: @@ -1026,20 +1176,20 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.8_cpu_upload + name: nightly_binary_linux_wheel_py3.7_cu111_upload requires: - - nightly_binary_linux_wheel_py3.8_cpu - subfolder: cpu/ + - nightly_binary_linux_wheel_py3.7_cu111 + subfolder: cu111/ - binary_linux_wheel: - cu_version: cu92 + cu_version: cpu filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.8_cu92 + name: nightly_binary_linux_wheel_py3.8_cpu python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda92 + wheel_docker_image: pytorch/manylinux-cuda102 - binary_wheel_upload: context: org-member filters: @@ -1047,10 +1197,10 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_wheel_py3.8_cu92_upload + name: nightly_binary_linux_wheel_py3.8_cpu_upload requires: - - nightly_binary_linux_wheel_py3.8_cu92 - subfolder: cu92/ + - nightly_binary_linux_wheel_py3.8_cpu + subfolder: cpu/ - binary_linux_wheel: cu_version: cu101 filters: @@ -1093,6 +1243,111 @@ workflows: requires: - nightly_binary_linux_wheel_py3.8_cu102 subfolder: cu102/ + - binary_linux_wheel: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.8_cu111 + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.8_cu111_upload + requires: + - nightly_binary_linux_wheel_py3.8_cu111 + subfolder: cu111/ + - binary_linux_wheel: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cpu + python_version: 
'3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py39_cpu_upload + requires: + - nightly_binary_linux_wheel_py3.9_cpu + subfolder: cpu/ + - binary_linux_wheel: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cu101 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cu101_upload + requires: + - nightly_binary_linux_wheel_py3.9_cu101 + subfolder: cu101/ + - binary_linux_wheel: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cu102 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cu102_upload + requires: + - nightly_binary_linux_wheel_py3.9_cu102 + subfolder: cu102/ + - binary_linux_wheel: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cu111 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_wheel_py3.9_cu111_upload + requires: + - nightly_binary_linux_wheel_py3.9_cu111 + subfolder: cu111/ - binary_macos_wheel: cu_version: cpu filters: @@ -1156,256 +1411,756 @@ workflows: requires: - nightly_binary_macos_wheel_py3.8_cpu subfolder: '' -# - 
binary_win_wheel: -# cu_version: cpu -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cpu -# python_version: '3.6' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cpu_upload -# requires: -# - nightly_binary_win_wheel_py3.6_cpu -# subfolder: cpu/ -# - binary_win_wheel: -# cu_version: cu92 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cu92 -# python_version: '3.6' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cu92_upload -# requires: -# - nightly_binary_win_wheel_py3.6_cu92 -# subfolder: cu92/ -# - binary_win_wheel: -# cu_version: cu101 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cu101 -# python_version: '3.6' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cu101_upload -# requires: -# - nightly_binary_win_wheel_py3.6_cu101 -# subfolder: cu101/ -# - binary_win_wheel: -# cu_version: cu102 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cu102 -# python_version: '3.6' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.6_cu102_upload -# requires: -# - nightly_binary_win_wheel_py3.6_cu102 -# subfolder: cu102/ -# - binary_win_wheel: -# cu_version: cpu -# filters: -# branches: -# only: nightly -# tags: -# only: 
/v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cpu -# python_version: '3.7' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cpu_upload -# requires: -# - nightly_binary_win_wheel_py3.7_cpu -# subfolder: cpu/ -# - binary_win_wheel: -# cu_version: cu92 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cu92 -# python_version: '3.7' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cu92_upload -# requires: -# - nightly_binary_win_wheel_py3.7_cu92 -# subfolder: cu92/ -# - binary_win_wheel: -# cu_version: cu101 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cu101 -# python_version: '3.7' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cu101_upload -# requires: -# - nightly_binary_win_wheel_py3.7_cu101 -# subfolder: cu101/ -# - binary_win_wheel: -# cu_version: cu102 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cu102 -# python_version: '3.7' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.7_cu102_upload -# requires: -# - nightly_binary_win_wheel_py3.7_cu102 -# subfolder: cu102/ -# - binary_win_wheel: -# cu_version: cpu -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cpu -# python_version: '3.8' -# - 
binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cpu_upload -# requires: -# - nightly_binary_win_wheel_py3.8_cpu -# subfolder: cpu/ -# - binary_win_wheel: -# cu_version: cu92 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cu92 -# python_version: '3.8' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cu92_upload -# requires: -# - nightly_binary_win_wheel_py3.8_cu92 -# subfolder: cu92/ -# - binary_win_wheel: -# cu_version: cu101 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cu101 -# python_version: '3.8' -# - binary_wheel_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cu101_upload -# requires: -# - nightly_binary_win_wheel_py3.8_cu101 -# subfolder: cu101/ -# - binary_win_wheel: -# cu_version: cu102 + - binary_macos_wheel: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_wheel_py3.9_cpu + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_wheel_py3.9_cpu_upload + requires: + - nightly_binary_macos_wheel_py3.9_cpu + subfolder: '' + - binary_win_wheel: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cpu + python_version: '3.6' + - binary_wheel_upload: + context: org-member + filters: + 
branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cpu_upload + requires: + - nightly_binary_win_wheel_py3.6_cpu + subfolder: cpu/ + - binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cu101 + python_version: '3.6' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cu101_upload + requires: + - nightly_binary_win_wheel_py3.6_cu101 + subfolder: cu101/ + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cu102 + python_version: '3.6' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cu102_upload + requires: + - nightly_binary_win_wheel_py3.6_cu102 + subfolder: cu102/ + - binary_win_wheel: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cu111 + python_version: '3.6' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.6_cu111_upload + requires: + - nightly_binary_win_wheel_py3.6_cu111 + subfolder: cu111/ + - binary_win_wheel: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cpu + python_version: '3.7' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cpu_upload + requires: + - nightly_binary_win_wheel_py3.7_cpu + subfolder: cpu/ + - 
binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cu101 + python_version: '3.7' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cu101_upload + requires: + - nightly_binary_win_wheel_py3.7_cu101 + subfolder: cu101/ + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cu102 + python_version: '3.7' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cu102_upload + requires: + - nightly_binary_win_wheel_py3.7_cu102 + subfolder: cu102/ + - binary_win_wheel: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cu111 + python_version: '3.7' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.7_cu111_upload + requires: + - nightly_binary_win_wheel_py3.7_cu111 + subfolder: cu111/ + - binary_win_wheel: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cpu + python_version: '3.8' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cpu_upload + requires: + - nightly_binary_win_wheel_py3.8_cpu + subfolder: cpu/ + - binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cu101 + python_version: '3.8' + - 
binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cu101_upload + requires: + - nightly_binary_win_wheel_py3.8_cu101 + subfolder: cu101/ + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cu102 + python_version: '3.8' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cu102_upload + requires: + - nightly_binary_win_wheel_py3.8_cu102 + subfolder: cu102/ + - binary_win_wheel: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cu111 + python_version: '3.8' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.8_cu111_upload + requires: + - nightly_binary_win_wheel_py3.8_cu111 + subfolder: cu111/ + - binary_win_wheel: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cpu + python_version: '3.9' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cpu_upload + requires: + - nightly_binary_win_wheel_py3.9_cpu + subfolder: cpu/ + - binary_win_wheel: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cu101 + python_version: '3.9' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cu101_upload + requires: + - 
nightly_binary_win_wheel_py3.9_cu101 + subfolder: cu101/ + - binary_win_wheel: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cu102 + python_version: '3.9' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cu102_upload + requires: + - nightly_binary_win_wheel_py3.9_cu102 + subfolder: cu102/ + - binary_win_wheel: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cu111 + python_version: '3.9' + - binary_wheel_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_wheel_py3.9_cu111_upload + requires: + - nightly_binary_win_wheel_py3.9_cu111 + subfolder: cu111/ + - binary_linux_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cpu + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cpu_upload + requires: + - nightly_binary_linux_conda_py3.6_cpu + - binary_linux_conda: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cu101 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cu101_upload + requires: + - nightly_binary_linux_conda_py3.6_cu101 + - binary_linux_conda: + cu_version: cu102 + filters: + branches: 
+ only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cu102 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cu102_upload + requires: + - nightly_binary_linux_conda_py3.6_cu102 + - binary_linux_conda: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cu111 + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.6_cu111_upload + requires: + - nightly_binary_linux_conda_py3.6_cu111 + - binary_linux_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cpu + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cpu_upload + requires: + - nightly_binary_linux_conda_py3.7_cpu + - binary_linux_conda: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cu101 + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cu101_upload + requires: + - nightly_binary_linux_conda_py3.7_cu101 + - binary_linux_conda: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: 
/v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cu102 + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cu102_upload + requires: + - nightly_binary_linux_conda_py3.7_cu102 + - binary_linux_conda: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cu111 + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.7_cu111_upload + requires: + - nightly_binary_linux_conda_py3.7_cu111 + - binary_linux_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cpu + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cpu_upload + requires: + - nightly_binary_linux_conda_py3.8_cpu + - binary_linux_conda: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cu101 + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cu101_upload + requires: + - nightly_binary_linux_conda_py3.8_cu101 + - binary_linux_conda: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: 
nightly_binary_linux_conda_py3.8_cu102 + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cu102_upload + requires: + - nightly_binary_linux_conda_py3.8_cu102 + - binary_linux_conda: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cu111 + python_version: '3.8' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.8_cu111_upload + requires: + - nightly_binary_linux_conda_py3.8_cu111 + - binary_linux_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cpu + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cpu_upload + requires: + - nightly_binary_linux_conda_py3.9_cpu + - binary_linux_conda: + cu_version: cu101 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cu101 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda101 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cu101_upload + requires: + - nightly_binary_linux_conda_py3.9_cu101 + - binary_linux_conda: + cu_version: cu102 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cu102 + 
python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cu102_upload + requires: + - nightly_binary_linux_conda_py3.9_cu102 + - binary_linux_conda: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cu111 + python_version: '3.9' + wheel_docker_image: pytorch/manylinux-cuda111 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_linux_conda_py3.9_cu111_upload + requires: + - nightly_binary_linux_conda_py3.9_cu111 + - binary_macos_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_conda_py3.6_cpu + python_version: '3.6' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_conda_py3.6_cpu_upload + requires: + - nightly_binary_macos_conda_py3.6_cpu + - binary_macos_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_conda_py3.7_cpu + python_version: '3.7' + wheel_docker_image: pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_conda_py3.7_cpu_upload + requires: + - nightly_binary_macos_conda_py3.7_cpu + - binary_macos_conda: + cu_version: cpu + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_conda_py3.8_cpu + python_version: '3.8' + wheel_docker_image: 
pytorch/manylinux-cuda102 + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_macos_conda_py3.8_cpu_upload + requires: + - nightly_binary_macos_conda_py3.8_cpu +# - binary_macos_conda: +# cu_version: cpu # filters: # branches: # only: nightly # tags: # only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cu102 -# python_version: '3.8' -# - binary_wheel_upload: +# name: nightly_binary_macos_conda_py3.9_cpu +# python_version: '3.9' +# wheel_docker_image: pytorch/manylinux-cuda102 +# - binary_conda_upload: # context: org-member # filters: # branches: # only: nightly # tags: # only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_wheel_py3.8_cu102_upload +# name: nightly_binary_macos_conda_py3.9_cpu_upload # requires: -# - nightly_binary_win_wheel_py3.8_cu102 -# subfolder: cu102/ - - binary_linux_conda: +# - nightly_binary_macos_conda_py3.9_cpu + - binary_win_conda: cu_version: cpu filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cpu + name: nightly_binary_win_conda_py3.6_cpu python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda102 - binary_conda_upload: context: org-member filters: @@ -1413,19 +2168,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cpu_upload + name: nightly_binary_win_conda_py3.6_cpu_upload requires: - - nightly_binary_linux_conda_py3.6_cpu - - binary_linux_conda: - cu_version: cu92 + - nightly_binary_win_conda_py3.6_cpu + - binary_win_conda: + cu_version: cu101 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cu92 + name: nightly_binary_win_conda_py3.6_cu101 python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_conda_upload: context: org-member filters: @@ -1433,19 +2187,18 @@ 
workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cu92_upload + name: nightly_binary_win_conda_py3.6_cu101_upload requires: - - nightly_binary_linux_conda_py3.6_cu92 - - binary_linux_conda: - cu_version: cu101 + - nightly_binary_win_conda_py3.6_cu101 + - binary_win_conda: + cu_version: cu102 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cu101 + name: nightly_binary_win_conda_py3.6_cu102 python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda101 - binary_conda_upload: context: org-member filters: @@ -1453,19 +2206,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cu101_upload + name: nightly_binary_win_conda_py3.6_cu102_upload requires: - - nightly_binary_linux_conda_py3.6_cu101 - - binary_linux_conda: - cu_version: cu102 + - nightly_binary_win_conda_py3.6_cu102 + - binary_win_conda: + cu_version: cu111 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cu102 + name: nightly_binary_win_conda_py3.6_cu111 python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda102 - binary_conda_upload: context: org-member filters: @@ -1473,19 +2225,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.6_cu102_upload + name: nightly_binary_win_conda_py3.6_cu111_upload requires: - - nightly_binary_linux_conda_py3.6_cu102 - - binary_linux_conda: + - nightly_binary_win_conda_py3.6_cu111 + - binary_win_conda: cu_version: cpu filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cpu + name: nightly_binary_win_conda_py3.7_cpu python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda102 - binary_conda_upload: context: org-member filters: @@ -1493,19 +2244,18 @@ workflows: only: nightly tags: 
only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cpu_upload + name: nightly_binary_win_conda_py3.7_cpu_upload requires: - - nightly_binary_linux_conda_py3.7_cpu - - binary_linux_conda: - cu_version: cu92 + - nightly_binary_win_conda_py3.7_cpu + - binary_win_conda: + cu_version: cu101 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cu92 + name: nightly_binary_win_conda_py3.7_cu101 python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_conda_upload: context: org-member filters: @@ -1513,19 +2263,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cu92_upload + name: nightly_binary_win_conda_py3.7_cu101_upload requires: - - nightly_binary_linux_conda_py3.7_cu92 - - binary_linux_conda: - cu_version: cu101 + - nightly_binary_win_conda_py3.7_cu101 + - binary_win_conda: + cu_version: cu102 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cu101 + name: nightly_binary_win_conda_py3.7_cu102 python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda101 - binary_conda_upload: context: org-member filters: @@ -1533,19 +2282,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cu101_upload + name: nightly_binary_win_conda_py3.7_cu102_upload requires: - - nightly_binary_linux_conda_py3.7_cu101 - - binary_linux_conda: - cu_version: cu102 + - nightly_binary_win_conda_py3.7_cu102 + - binary_win_conda: + cu_version: cu111 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cu102 + name: nightly_binary_win_conda_py3.7_cu111 python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda102 - binary_conda_upload: context: org-member filters: @@ -1553,19 +2301,18 @@ workflows: only: nightly tags: only: 
/v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.7_cu102_upload + name: nightly_binary_win_conda_py3.7_cu111_upload requires: - - nightly_binary_linux_conda_py3.7_cu102 - - binary_linux_conda: + - nightly_binary_win_conda_py3.7_cu111 + - binary_win_conda: cu_version: cpu filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cpu + name: nightly_binary_win_conda_py3.8_cpu python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda102 - binary_conda_upload: context: org-member filters: @@ -1573,19 +2320,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cpu_upload + name: nightly_binary_win_conda_py3.8_cpu_upload requires: - - nightly_binary_linux_conda_py3.8_cpu - - binary_linux_conda: - cu_version: cu92 + - nightly_binary_win_conda_py3.8_cpu + - binary_win_conda: + cu_version: cu101 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cu92 + name: nightly_binary_win_conda_py3.8_cu101 python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda92 - binary_conda_upload: context: org-member filters: @@ -1593,19 +2339,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cu92_upload + name: nightly_binary_win_conda_py3.8_cu101_upload requires: - - nightly_binary_linux_conda_py3.8_cu92 - - binary_linux_conda: - cu_version: cu101 + - nightly_binary_win_conda_py3.8_cu101 + - binary_win_conda: + cu_version: cu102 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cu101 + name: nightly_binary_win_conda_py3.8_cu102 python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda101 - binary_conda_upload: context: org-member filters: @@ -1613,19 +2358,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: 
nightly_binary_linux_conda_py3.8_cu101_upload + name: nightly_binary_win_conda_py3.8_cu102_upload requires: - - nightly_binary_linux_conda_py3.8_cu101 - - binary_linux_conda: - cu_version: cu102 + - nightly_binary_win_conda_py3.8_cu102 + - binary_win_conda: + cu_version: cu111 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cu102 + name: nightly_binary_win_conda_py3.8_cu111 python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda102 - binary_conda_upload: context: org-member filters: @@ -1633,19 +2377,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_linux_conda_py3.8_cu102_upload + name: nightly_binary_win_conda_py3.8_cu111_upload requires: - - nightly_binary_linux_conda_py3.8_cu102 - - binary_macos_conda: + - nightly_binary_win_conda_py3.8_cu111 + - binary_win_conda: cu_version: cpu filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_macos_conda_py3.6_cpu - python_version: '3.6' - wheel_docker_image: pytorch/manylinux-cuda102 + name: nightly_binary_win_conda_py3.9_cpu + python_version: '3.9' - binary_conda_upload: context: org-member filters: @@ -1653,19 +2396,18 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_macos_conda_py3.6_cpu_upload + name: nightly_binary_win_conda_py3.9_cpu_upload requires: - - nightly_binary_macos_conda_py3.6_cpu - - binary_macos_conda: - cu_version: cpu + - nightly_binary_win_conda_py3.9_cpu + - binary_win_conda: + cu_version: cu101 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_macos_conda_py3.7_cpu - python_version: '3.7' - wheel_docker_image: pytorch/manylinux-cuda102 + name: nightly_binary_win_conda_py3.9_cu101 + python_version: '3.9' - binary_conda_upload: context: org-member filters: @@ -1673,19 +2415,18 @@ workflows: only: nightly tags: only: 
/v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_macos_conda_py3.7_cpu_upload + name: nightly_binary_win_conda_py3.9_cu101_upload requires: - - nightly_binary_macos_conda_py3.7_cpu - - binary_macos_conda: - cu_version: cpu + - nightly_binary_win_conda_py3.9_cu101 + - binary_win_conda: + cu_version: cu102 filters: branches: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_macos_conda_py3.8_cpu - python_version: '3.8' - wheel_docker_image: pytorch/manylinux-cuda102 + name: nightly_binary_win_conda_py3.9_cu102 + python_version: '3.9' - binary_conda_upload: context: org-member filters: @@ -1693,234 +2434,25 @@ workflows: only: nightly tags: only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ - name: nightly_binary_macos_conda_py3.8_cpu_upload + name: nightly_binary_win_conda_py3.9_cu102_upload requires: - - nightly_binary_macos_conda_py3.8_cpu -# - binary_win_conda: -# cu_version: cpu -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cpu -# python_version: '3.6' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cpu_upload -# requires: -# - nightly_binary_win_conda_py3.6_cpu -# - binary_win_conda: -# cu_version: cu92 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cu92 -# python_version: '3.6' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cu92_upload -# requires: -# - nightly_binary_win_conda_py3.6_cu92 -# - binary_win_conda: -# cu_version: cu101 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cu101 -# python_version: '3.6' -# - binary_conda_upload: -# 
context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cu101_upload -# requires: -# - nightly_binary_win_conda_py3.6_cu101 -# - binary_win_conda: -# cu_version: cu102 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cu102 -# python_version: '3.6' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.6_cu102_upload -# requires: -# - nightly_binary_win_conda_py3.6_cu102 -# - binary_win_conda: -# cu_version: cpu -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cpu -# python_version: '3.7' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cpu_upload -# requires: -# - nightly_binary_win_conda_py3.7_cpu -# - binary_win_conda: -# cu_version: cu92 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cu92 -# python_version: '3.7' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cu92_upload -# requires: -# - nightly_binary_win_conda_py3.7_cu92 -# - binary_win_conda: -# cu_version: cu101 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cu101 -# python_version: '3.7' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cu101_upload -# requires: -# - 
nightly_binary_win_conda_py3.7_cu101 -# - binary_win_conda: -# cu_version: cu102 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cu102 -# python_version: '3.7' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.7_cu102_upload -# requires: -# - nightly_binary_win_conda_py3.7_cu102 -# - binary_win_conda: -# cu_version: cpu -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cpu -# python_version: '3.8' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cpu_upload -# requires: -# - nightly_binary_win_conda_py3.8_cpu -# - binary_win_conda: -# cu_version: cu92 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cu92 -# python_version: '3.8' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cu92_upload -# requires: -# - nightly_binary_win_conda_py3.8_cu92 -# - binary_win_conda: -# cu_version: cu101 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cu101 -# python_version: '3.8' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cu101_upload -# requires: -# - nightly_binary_win_conda_py3.8_cu101 -# - binary_win_conda: -# cu_version: cu102 -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: 
nightly_binary_win_conda_py3.8_cu102 -# python_version: '3.8' -# - binary_conda_upload: -# context: org-member -# filters: -# branches: -# only: nightly -# tags: -# only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ -# name: nightly_binary_win_conda_py3.8_cu102_upload -# requires: -# - nightly_binary_win_conda_py3.8_cu102 + - nightly_binary_win_conda_py3.9_cu102 + - binary_win_conda: + cu_version: cu111 + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_conda_py3.9_cu111 + python_version: '3.9' + - binary_conda_upload: + context: org-member + filters: + branches: + only: nightly + tags: + only: /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + name: nightly_binary_win_conda_py3.9_cu111_upload + requires: + - nightly_binary_win_conda_py3.9_cu111 diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml new file mode 100644 index 0000000..ca96279 --- /dev/null +++ b/.circleci/unittest/linux/scripts/environment.yml @@ -0,0 +1,15 @@ +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - numpy + - pytest + - pytest-cov + - codecov + - pip + - ca-certificates + - pycrypto + - pip: + - future + - scipy diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh new file mode 100755 index 0000000..6334cb9 --- /dev/null +++ b/.circleci/unittest/linux/scripts/install.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +unset PYTORCH_VERSION +# For unittest, nightly PyTorch is used as the following section, +# so no need to set PYTORCH_VERSION. +# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. 
+ +set -e + +eval "$(./conda/bin/conda shell.bash hook)" +conda activate ./env + +if [ "${CU_VERSION:-}" == cpu ] ; then + cudatoolkit="cpuonly" +else + if [[ ${#CU_VERSION} -eq 4 ]]; then + CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" + elif [[ ${#CU_VERSION} -eq 5 ]]; then + CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" + fi + echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" + version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" + cudatoolkit="cudatoolkit=${version}" +fi +printf "Installing PyTorch with %s\n" "${cudatoolkit}" +conda install -y -c pytorch-nightly pytorch "${cudatoolkit}" + +printf "* Installing torchcsprng\n" +python setup.py develop \ No newline at end of file diff --git a/.circleci/unittest/linux/scripts/post_process.sh b/.circleci/unittest/linux/scripts/post_process.sh new file mode 100755 index 0000000..b05be6d --- /dev/null +++ b/.circleci/unittest/linux/scripts/post_process.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +set -e + +eval "$(./conda/bin/conda shell.bash hook)" +conda activate ./env + +codecov \ No newline at end of file diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.circleci/unittest/linux/scripts/run_test.sh new file mode 100755 index 0000000..61f6e3e --- /dev/null +++ b/.circleci/unittest/linux/scripts/run_test.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -e + +eval "$(./conda/bin/conda shell.bash hook)" +conda activate ./env + +python -m torch.utils.collect_env +pytest --cov=torchcsprng --junitxml=test-results/junit.xml -v --durations 20 test \ No newline at end of file diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh new file mode 100755 index 0000000..054ebf2 --- /dev/null +++ b/.circleci/unittest/linux/scripts/setup_env.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# This script is for setting up environment in which unit test is ran. +# To speed up the CI time, the resulting environment is cached. 
+# +# Do not install PyTorch and torchcsprng here, otherwise they also get cached. + +set -e + +this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +root_dir="$(git rev-parse --show-toplevel)" +conda_dir="${root_dir}/conda" +env_dir="${root_dir}/env" + +cd "${root_dir}" + +case "$(uname -s)" in + Darwin*) os=MacOSX;; + *) os=Linux +esac + +# 1. Install conda at ./conda +if [ ! -d "${conda_dir}" ]; then + printf "* Installing conda\n" + wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" + bash ./miniconda.sh -b -f -p "${conda_dir}" +fi +eval "$(${conda_dir}/bin/conda shell.bash hook)" + +# 2. Create test environment at ./env +if [ ! -d "${env_dir}" ]; then + printf "* Creating a test environment\n" + conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" +fi +conda activate "${env_dir}" + +# 3. Install Conda dependencies +printf "* Installing dependencies (except PyTorch)\n" +conda env update --file "${this_dir}/environment.yml" --prune diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml new file mode 100644 index 0000000..ca96279 --- /dev/null +++ b/.circleci/unittest/windows/scripts/environment.yml @@ -0,0 +1,15 @@ +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - numpy + - pytest + - pytest-cov + - codecov + - pip + - ca-certificates + - pycrypto + - pip: + - future + - scipy diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh new file mode 100644 index 0000000..deba8f6 --- /dev/null +++ b/.circleci/unittest/windows/scripts/install.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +unset PYTORCH_VERSION +# For unittest, nightly PyTorch is used as the following section, +# so no need to set PYTORCH_VERSION. +# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. 
+ +set -e + +this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" +conda activate ./env + +if [ "${CU_VERSION:-}" == cpu ] ; then + cudatoolkit="cpuonly" +else + if [[ ${#CU_VERSION} -eq 4 ]]; then + CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" + elif [[ ${#CU_VERSION} -eq 5 ]]; then + CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" + fi + echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" + version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" + cudatoolkit="cudatoolkit=${version}" +fi +printf "Installing PyTorch with %s\n" "${cudatoolkit}" +conda install -y -c pytorch-nightly pytorch "${cudatoolkit}" + +printf "* Installing torchcsprng\n" +"$this_dir/vc_env_helper.bat" python setup.py develop \ No newline at end of file diff --git a/.circleci/unittest/windows/scripts/install_conda.bat b/.circleci/unittest/windows/scripts/install_conda.bat new file mode 100644 index 0000000..6612fba --- /dev/null +++ b/.circleci/unittest/windows/scripts/install_conda.bat @@ -0,0 +1 @@ +start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% \ No newline at end of file diff --git a/.circleci/unittest/windows/scripts/post_process.sh b/.circleci/unittest/windows/scripts/post_process.sh new file mode 100644 index 0000000..2a1ac63 --- /dev/null +++ b/.circleci/unittest/windows/scripts/post_process.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +set -e + +eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" +conda activate ./env + +#codecov diff --git a/.circleci/unittest/windows/scripts/run_test.sh b/.circleci/unittest/windows/scripts/run_test.sh new file mode 100644 index 0000000..02c6327 --- /dev/null +++ b/.circleci/unittest/windows/scripts/run_test.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -e + +eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" +conda activate ./env + +python -m 
torch.utils.collect_env +pytest --cov=torchcsprng --junitxml=test-results/junit.xml -v --durations 20 test \ No newline at end of file diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh new file mode 100644 index 0000000..6a73927 --- /dev/null +++ b/.circleci/unittest/windows/scripts/setup_env.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# This script is for setting up environment in which unit test is ran. +# To speed up the CI time, the resulting environment is cached. +# +# Do not install PyTorch and torchcsprng here, otherwise they also get cached. + +set -e + +this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +root_dir="$(git rev-parse --show-toplevel)" +conda_dir="${root_dir}/conda" +env_dir="${root_dir}/env" + +cd "${root_dir}" + +# 1. Install conda at ./conda +if [ ! -d "${conda_dir}" ]; then + printf "* Installing conda\n" + export tmp_conda="$(echo $conda_dir | tr '/' '\\')" + export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" + curl --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O + "$this_dir/install_conda.bat" + unset tmp_conda + unset miniconda_exe +fi + +eval "$(${conda_dir}/Scripts/conda.exe 'shell.bash' 'hook')" + +# 2. Create test environment at ./env +if [ ! -d "${env_dir}" ]; then + printf "* Creating a test environment\n" + conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" +fi +conda activate "${env_dir}" + +# 3. 
Install Conda dependencies +printf "* Installing dependencies (except PyTorch)\n" +conda env update --file "${this_dir}/environment.yml" --prune \ No newline at end of file diff --git a/.circleci/unittest/windows/scripts/vc_env_helper.bat b/.circleci/unittest/windows/scripts/vc_env_helper.bat new file mode 100644 index 0000000..9410135 --- /dev/null +++ b/.circleci/unittest/windows/scripts/vc_env_helper.bat @@ -0,0 +1,39 @@ +@echo on + +set VC_VERSION_LOWER=16 +set VC_VERSION_UPPER=17 + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15INSTALLDIR=%%i" + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto vswhere + ) +) + +:vswhere +if "%VSDEVCMD_ARGS%" == "" ( + call "%VS15VCVARSALL%" x64 || exit /b 1 +) else ( + call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 +) + +@echo on + +set DISTUTILS_USE_SDK=1 + +set args=%1 +shift +:start +if [%1] == [] goto done +set args=%args% %1 +shift +goto start + +:done +if "%args%" == "" ( + echo Usage: vc_env_helper.bat [command] [args] + echo e.g. 
vc_env_helper.bat cl /c test.cpp +) + +%args% || exit /b 1 diff --git a/.gitignore b/.gitignore index 4404a2b..ee0c254 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,10 @@ +.idea +.vscode +# do not change or delete this comment - `python setup.py clean` deletes everything after this line dist/ build/ *.egg-info/ -.idea -.vscode -torch_csprng/version.py +torchcsprng/version.py */__pycache__ +.pytest_cache *.so diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..b91e23b --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..fdc1528 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing to csprng +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Issues +We use GitHub issues to track public bugs. 
Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + +## License +By contributing to csprng, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. \ No newline at end of file diff --git a/README.md b/README.md index 792c546..128e327 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,54 @@ # PyTorch/CSPRNG -CSPRNG is a [PyTorch C++/CUDA extension](https://pytorch.org/tutorials/advanced/cpp_extension.html) that provides [cryptographically secure pseudorandom number generators](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) for PyTorch. - [![CircleCI](https://circleci.com/gh/pytorch/csprng.svg?style=shield&circle-token=64701692dd7f13f31019612289f0200fdb661dc2)](https://circleci.com/gh/pytorch/csprng) +torchcsprng is a [PyTorch C++/CUDA extension](https://pytorch.org/tutorials/advanced/cpp_extension.html) that provides: + +- [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) 128-bit encryption/decryption in two modes: [ECB](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) and [CTR](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/csprng/blob/master/examples/encrypt_decrypt.ipynb) +- [cryptographically secure pseudorandom number generators](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) for PyTorch. 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/csprng/blob/master/examples/csprng.ipynb) + ## Design -CSPRNG generates a random 128-bits key on CPU using one of its generators and runs +torchcsprng generates a random 128-bit key on CPU using one of its generators and runs [AES128](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) in [CTR mode](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)) -mode either on CPU or on GPU using CUDA to generate random 128 bits state and apply transformation function to map it to target tensor values. + either on CPU or on GPU using CUDA to generate a random 128 bit state and apply a transformation function to map it to target tensor values. This approach is based on [Parallel Random Numbers: As Easy as 1, 2, 3(John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, D. E. Shaw Research)](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf). -It makes CSPRNG both crypto-secure and parallel on CUDA and CPU. +It makes torchcsprng both crypto-secure and parallel on CUDA and CPU. ![CSPRNG architecture](.github/csprng_architecture.png) Advantages: -- A user can choose either seed-based(for testing) or random device based(fully crypto-secure) generators +- The user can choose either seed-based(for testing) or random device based(fully crypto-secure) generators - One generator instance for both CPU and CUDA tensors(because the encryption key is always generated on CPU) -- CPU random number generation is also parallel(unlike default PyTorch CPU generator) +- CPU random number generation is also parallel(unlike the default PyTorch CPU generator) ## Features -CSPRNG exposes two methods to create crypto-secure and non-crypto-secure PRNGs: +torchcsprng 0.2.0 exposes new API for tensor encryption/decryption. 
Tensor encryption/decryption API is dtype agnostic, so a tensor of any dtype can be encrypted and the result can be stored to a tensor of any dtype. An encryption key also can be a tensor of any dtype. Currently torchcsprng supports [AES](https://en.wikipedia.org/wiki/Advanced_Encryption_Standard) cipher with 128-bit key in two modes: [ECB](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) and [CTR](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)). + +* `torchcsprng.encrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)` + +> - `input` tensor can be any CPU or CUDA tensor of any dtype and size in bytes(zero-padding is used to make its size in bytes divisible by block size in bytes) +> - `output` tensor can have any dtype and the same device as `input` tensor and the size in bytes rounded up to the block size in bytes(16 bytes for AES 128) +> - `key` tensor can have any dtype and the same device as `input` tensor and size in bytes equal to 16 for AES 128 +> - `cipher` currently can be only one supported value `"aes128"` +> - `mode` currently can be either [`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)) + +* `torchcsprng.decrypt(input: Tensor, output: Tensor, key: Tensor, cipher: string, mode: string)` + +> - `input` tensor can be any CPU or CUDA tensor of any dtype with size in bytes divisible by the block size in bytes(16 bytes for AES 128) +> - `output` tensor can have any dtype but the same device as `input` tensor and the same size in bytes as `input` tensor +> - `key` tensor can have any dtype and the same device as `input` tensor and size in bytes equal to 16 for AES 128 +> - `cipher` currently can be only one supported value `"aes128"` +> - `mode` currently can be either 
[`"ecb"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) or [`"ctr"`](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)) + +torchcsprng exposes two methods to create crypto-secure and non-crypto-secure PRNGs: -| Method to create PRNG | Is crypto-secure? | Has seed? | Underlying implementation | -|----------------------------------------------------|-------------------|-----------|------------------------------------------------------------------------------------------------------------------------------------| -| create_random_device_generator(token: string=None) | yes | no | See [std::random_device](https://en.cppreference.com/w/cpp/numeric/random/random_device) and [it's constructor](https://en.cppreference.com/w/cpp/numeric/random/random_device/random_device). The implementation in libstdc++ expects token to name the source of random bytes. Possible token values include "default", "rand_s", "rdseed", "rdrand", "rdrnd", "/dev/urandom", "/dev/random", "mt19937", and integer string specifying the seed of the mt19937 engine. (Token values other than "default" are only valid for certain targets.) If token=None then constructs a new std::random_device object with an implementation-defined token. | -| create_mt19937_generator(seed: int=None) | no | yes | See [std::mt19937](https://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine) and [it's constructor](https://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine/mersenne_twister_engine). Constructs a mersenne_twister_engine object, and initializes its internal state sequence to pseudo-random values. If seed=None then seeds the engine with default_seed.| +| Method to create PRNG | Is crypto-secure? | Has seed? 
| Underlying implementation | +|----------------------------------------------------|-------------------|-----------|---------------------------| +| create_random_device_generator(token: string=None) | yes | no | See [std::random_device](https://en.cppreference.com/w/cpp/numeric/random/random_device) and [its constructor](https://en.cppreference.com/w/cpp/numeric/random/random_device/random_device). The implementation in libstdc++ expects token to name the source of random bytes. Possible token values include "default", "rand_s", "rdseed", "rdrand", "rdrnd", "/dev/urandom", "/dev/random", "mt19937", and integer string specifying the seed of the mt19937 engine. (Token values other than "default" are only valid for certain targets.) If token=None then constructs a new std::random_device object with an implementation-defined token. | +| create_mt19937_generator(seed: int=None) | no | yes | See [std::mt19937](https://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine) and [its constructor](https://en.cppreference.com/w/cpp/numeric/random/mersenne_twister_engine/mersenne_twister_engine). Constructs a mersenne_twister_engine object, and initializes its internal state sequence to pseudo-random values. 
If seed=None then seeds the engine with default_seed.| The following list of methods supports all forementioned PRNGs: @@ -42,68 +63,151 @@ The following list of methods supports all forementioned PRNGs: | log_normal_(mean, std) | yes | yes | | geometric_(p) | yes | yes | | exponential_(lambda) | yes | yes | +| randperm(n) | yes* | yes | + +* the calculations are done on CPU and the result is copied to CUDA + +## Installation + +CSPRNG works with Python 3.6-3.9 on the following operating systems and can be used with PyTorch tensors on the following devices: + +| Tensor Device Type | Linux | macOS | MS Window | +|--------------------|-----------|---------------|----------------| +| CPU | Supported | Supported | Supported | +| CUDA | Supported | Not Supported | Supported since 0.2.0 | + +The following is the corresponding CSPRNG versions and supported Python versions. + +| PyTorch | CSPRNG | Python | CUDA | +|---------|--------|----------|------------------| +| 1.8.0 | 0.2.0 | 3.7-3.9 | 10.1, 10.2, 11.1 | +| 1.7.1 | 0.1.4 | 3.6-3.8 | 9.2, 10.1, 10.2 | +| 1.7.0 | 0.1.3 | 3.6-3.8 | 9.2, 10.1, 10.2 | +| 1.6.0 | 0.1.2 | 3.6-3.8 | 9.2, 10.1, 10.2 | + + +### Binary Installation -## How to build +Anaconda: -Since CSPRNG is C++/CUDA extension it uses setuptools, just run `python setup.py install` to build and install it. +| OS | CUDA | | +|---------------|------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Linux/Windows | 10.1

10.2

11.1

None | conda install torchcsprng cudatoolkit=10.1 -c pytorch -c conda-forge

conda install torchcsprng cudatoolkit=10.2 -c pytorch -c conda-forge

conda install torchcsprng cudatoolkit=11.1 -c pytorch -c conda-forge

conda install torchcsprng cpuonly -c pytorch -c conda-forge | +| macOS | None | conda install torchcsprng -c pytorch | -## How to use +pip: +| OS | CUDA | | +|---------------|------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Linux/Windows | 10.1

10.2

11.1

None | pip install torchcsprng==0.2.0+cu101 torch==1.8.0+cu101 -f https://download.pytorch.org/whl/cu101/torch_stable.html

pip install torchcsprng==0.2.0 torch==1.8.0 -f https://download.pytorch.org/whl/cu102/torch_stable.html

pip install torchcsprng==0.2.0+cu111 torch==1.8.0+cu111 -f https://download.pytorch.org/whl/cu111/torch_stable.html

pip install torchcsprng==0.2.0+cpu torch==1.8.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html | +| macOS | None | pip install torchcsprng torch | + +### Nightly builds: + +Anaconda: + +| OS | CUDA | | +|---------------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Linux/Windows | 10.1

10.2

11.1

None | conda install torchcsprng cudatoolkit=10.1 -c pytorch-nightly -c conda-forge

conda install torchcsprng cudatoolkit=10.2 -c pytorch-nightly -c conda-forge

conda install torchcsprng cudatoolkit=11.1 -c pytorch-nightly -c conda-forge

conda install torchcsprng cpuonly -c pytorch-nightly -c conda-forge | +| macOS | None | conda install torchcsprng -c pytorch-nightly | + +pip: + +| OS | CUDA | | +|---------------|------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Linux/Windows | 10.1

10.2

11.1

None | pip install --pre torchcsprng -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html

pip install --pre torchcsprng -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html

pip install --pre torchcsprng -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html

pip install --pre torchcsprng -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html | +| macOS | None | pip install --pre torchcsprng -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html | + +### From Source + +torchcsprng is a Python C++/CUDA extension that depends on PyTorch. In order to build CSPRNG from source it is required to have Python(>=3.7) with PyTorch(>=1.8.0) installed and C++ compiler(gcc/clang for Linux, XCode for macOS, Visual Studio for MS Windows). +To build torchcsprng you can run the following: +```console +python setup.py install +``` +By default, GPU support is built if CUDA is found and torch.cuda.is_available() is True. Additionally, it is possible to force building GPU support by setting the FORCE_CUDA=1 environment variable, which is useful when building a docker image. + +## Getting Started + +The torchcsprng API is available in `torchcsprng` module: ```python import torch -import torch_csprng as csprng - -# Create crypto-secure PRNG from /dev/urandom: +import torchcsprng as csprng +``` +Create crypto-secure PRNG from /dev/urandom: +```python urandom_gen = csprng.create_random_device_generator('/dev/urandom') +``` -# Create empty boolean tensor on CUDA and initialize it with random values from urandom_gen: -print(torch.empty(10, dtype=torch.bool, device='cuda').random_(generator=urandom_gen)) +Create empty boolean tensor on CUDA and initialize it with random values from urandom_gen: +```python +torch.empty(10, dtype=torch.bool, device='cuda').random_(generator=urandom_gen) +``` +``` tensor([ True, False, False, True, False, False, False, True, False, False], device='cuda:0') +``` -# Create empty int16 tensor on CUDA and initialize it with random values in range [0, 100) from urandom_gen: -print(torch.empty(10, dtype=torch.int16, device='cuda').random_(100, generator=urandom_gen)) +Create empty int16 tensor on CUDA and initialize it with random values in range [0, 100) from urandom_gen: +```python 
+torch.empty(10, dtype=torch.int16, device='cuda').random_(100, generator=urandom_gen) +``` +``` tensor([59, 20, 68, 51, 18, 37, 7, 54, 74, 85], device='cuda:0', dtype=torch.int16) +``` -# Create non-crypto-secure MT19937 PRNG: +Create non-crypto-secure MT19937 PRNG: +```python mt19937_gen = csprng.create_mt19937_generator() - -print(torch.empty(10, dtype=torch.int64, device='cuda').random_(torch.iinfo(torch.int64).min, to=None, generator=mt19937_gen)) +torch.empty(10, dtype=torch.int64, device='cuda').random_(torch.iinfo(torch.int64).min, to=None, generator=mt19937_gen) +``` +``` tensor([-7584783661268263470, 2477984957619728163, -3472586837228887516, -5174704429717287072, 4125764479102447192, -4763846282056057972, -182922600982469112, -498242863868415842, 728545841957750221, 7740902737283645074], device='cuda:0') +``` -# Create crypto-secure PRNG from default random device: +Create crypto-secure PRNG from default random device: +```python default_device_gen = csprng.create_random_device_generator() - -print(torch.randn(10, device='cuda', generator=default_device_gen)) +torch.randn(10, device='cuda', generator=default_device_gen) +``` +``` tensor([ 1.2885, 0.3240, -1.1813, 0.8629, 0.5714, 2.3720, -0.5627, -0.5551, -0.6304, 0.1090], device='cuda:0') +``` -# Create non-crypto-secure MT19937 PRNG with seed +Create non-crypto-secure MT19937 PRNG with seed: +```python mt19937_gen = csprng.create_mt19937_generator(42) +torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen) +``` +``` +tensor([ 7., 1., 8., 1., 11., 3., 1., 1., 5., 10.], device='cuda:0') +``` -print(torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen)) +Recreate MT19937 PRNG with the same seed: +```python +mt19937_gen = csprng.create_mt19937_generator(42) +torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen) +``` +``` tensor([ 7., 1., 8., 1., 11., 3., 1., 1., 5., 10.], device='cuda:0') +``` -print(torch.empty(10, device='cuda').geometric_(p=0.2, 
generator=mt19937_gen)) -tensor([ 1., 1., 1., 6., 1., 13., 5., 1., 3., 4.], device='cuda:0') +## Contributing +We appreciate all contributions. If you are planning to contribute back bug-fixes, please do so without any further discussion. If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us. -print(torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen)) -tensor([14., 5., 4., 5., 1., 1., 8., 1., 7., 10.], device='cuda:0') -# Recreate MT19937 PRNG with the same seed -mt19937_gen = csprng.create_mt19937_generator(42) -print(torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen)) -tensor([ 7., 1., 8., 1., 11., 3., 1., 1., 5., 10.], device='cuda:0') +## License -print(torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen)) -tensor([ 1., 1., 1., 6., 1., 13., 5., 1., 3., 4.], device='cuda:0') +torchcsprng is BSD 3-clause licensed. See the license file [here](https://github.com/pytorch/csprng/blob/master/LICENSE) -print(torch.empty(10, device='cuda').geometric_(p=0.2, generator=mt19937_gen)) -tensor([14., 5., 4., 5., 1., 1., 8., 1., 7., 10.], device='cuda:0') +## [Terms of Use](https://opensource.facebook.com/legal/terms) -``` +## [Privacy Policy](https://opensource.facebook.com/legal/privacy) + +Copyright © 2020 Meta Platforms, Inc diff --git a/examples/csprng.ipynb b/examples/csprng.ipynb new file mode 100644 index 0000000..1f6b477 --- /dev/null +++ b/examples/csprng.ipynb @@ -0,0 +1,226 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "csprng.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Lpno_zUJT8ms" + }, + "source": [ + "# Cryptographically secure pseudorandom number generators for PyTorch\n", + "\n", + "The torchcsprng API is available in 
`torchcsprng` module:\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "db4YYky-PDI_" + }, + "source": [ + "!pip install torchcsprng==0.2.0 torch==1.8.0 -f https://download.pytorch.org/whl/cu101/torch_stable.html" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "O1s_j8CPPHSn" + }, + "source": [ + "import torch\n", + "import torchcsprng as csprng" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "o1Kz25IoS9m-" + }, + "source": [ + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HLlLxkDIUWCG" + }, + "source": [ + "Create crypto-secure PRNG from /dev/urandom:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yyyYlq5kUQss" + }, + "source": [ + "urandom_gen = csprng.create_random_device_generator('/dev/urandom')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xbUCnJfkUdUI" + }, + "source": [ + "Create empty boolean tensor on the `device` and initialize it with random values from `urandom_gen`:\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zmj_VlIzUYIO" + }, + "source": [ + "torch.empty(10, dtype=torch.bool, device=device).random_(generator=urandom_gen)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ycODsYhtUud9" + }, + "source": [ + "Create empty int16 tensor on the `device` and initialize it with random values in range [0, 100) from `urandom_gen`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uel-jbW9UlZH" + }, + "source": [ + "torch.empty(10, dtype=torch.int16, device=device).random_(100, generator=urandom_gen)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1jXW1FEmVMW_" + }, + 
"source": [ + "Create non-crypto-secure MT19937 PRNG:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "sL-cwFGfVOrp" + }, + "source": [ + "mt19937_gen = csprng.create_mt19937_generator()\n", + "torch.empty(10, dtype=torch.int64, device=device).random_(torch.iinfo(torch.int64).min, to=None, generator=mt19937_gen)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KW96wT4UVXBm" + }, + "source": [ + "Create crypto-secure PRNG from default random device:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "tjwbuE6FVRgm" + }, + "source": [ + "default_device_gen = csprng.create_random_device_generator()\n", + "torch.randn(10, device=device, generator=default_device_gen)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qYgdkZAYVfZT" + }, + "source": [ + "Create non-crypto-secure MT19937 PRNG with seed:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xjOsYOxxVbzg" + }, + "source": [ + "mt19937_gen = csprng.create_mt19937_generator(42)\n", + "first = torch.empty(10, device=device).geometric_(p=0.2, generator=mt19937_gen)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cV77v7tHVlRd" + }, + "source": [ + "Recreate MT19937 PRNG with the same seed:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "i0O2lC0hVjAg" + }, + "source": [ + "mt19937_gen = csprng.create_mt19937_generator(42)\n", + "second = torch.empty(10, device=device).geometric_(p=0.2, generator=mt19937_gen)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OcgSK0mejcef" + }, + "source": [ + "Check that `first` equals to `second`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vMx1BRO3jh7L" + }, + "source": [ + "assert (first == second).all()" + ], + "execution_count": null, + "outputs": [] + } + ] +} diff --git 
a/examples/encrypt_decrypt.ipynb b/examples/encrypt_decrypt.ipynb new file mode 100644 index 0000000..3de8968 --- /dev/null +++ b/examples/encrypt_decrypt.ipynb @@ -0,0 +1,307 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "encrypt_decrypt.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "4JG-7IJgz_dK" + }, + "source": [ + "# PyTorch/CSPRNG encrypt/decrypt examples" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H8TZemj30JvQ" + }, + "source": [ + "torchcsprng 0.2.0 exposes new API for tensor encryption/decryption. Tensor encryption/decryption API is dtype agnostic, so a tensor of any dtype can be encrypted and the result can be stored to a tensor of any dtype. An encryption key also can be a tensor of any dtype. Currently torchcsprng supports AES cipher with 128-bit key in two modes: ECB and CTR." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "jC1O-C25vI0W" + }, + "source": [ + "!pip install torchcsprng==0.2.0 torch==1.8.0 -f https://download.pytorch.org/whl/cu101/torch_stable.html" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "su2RWWdOrWFU" + }, + "source": [ + "import torch\n", + "import torchcsprng as csprng" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NHTOLPZ_3254" + }, + "source": [ + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "17L0sgmy0R6o" + }, + "source": [ + "torchcsprng implementation of AES with 128 bit key requires to have a key tensor of 16 bytes but of any dtype" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rw7WYZ-50To9" + }, + "source": [ + "key = torch.empty(16, dtype=torch.uint8, device=device).random_(0, 256)\n", + "key" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RRfvyfHM4MY1" + }, + "source": [ + "Alternatively it can be a tensor of 8 elements of `torch.int16` or even 4 elements of `torch.float32`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rCy01t1-0dtO" + }, + "source": [ + "The size of input tensor is 42 * (32/8) = 168 bytes. 
AES 128 operates with 16-byte blocks, so zero-padding of 8 bytes will be used to form 176 bytes (eleven 16-byte blocks)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LcuVmhyU0WTn" + }, + "source": [ + "initial = torch.empty(42, dtype=torch.float32, device=device).normal_(-24.0, 42.0)\n", + "initial" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rPNq2u4e3tlJ" + }, + "source": [ + "torchcsprng requires output tensor to be of the same size in bytes as input tensor rounded up to 16 bytes (AES 128 block size), so if `torch.int64` is dtype of the destination tensor size must be 176 / (64/8) = 22" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RAJya9GT0gb4" + }, + "source": [ + "encrypted = torch.empty(22, dtype=torch.int64, device=device)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-DCI4QOh4oGX" + }, + "source": [ + "Call `torchcsprng.encrypt` to encrypt `initial` tensor in [ECB](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB)) mode with 128-bit `key` tensor and store the result to `encrypted` tensor." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "TK4OjPRq4lsJ" + }, + "source": [ + "csprng.encrypt(initial, encrypted, key, \"aes128\", \"ecb\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yXUAwFHh5PSy" + }, + "source": [ + "Create an output tensor" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4LtJ-kD446DJ" + }, + "source": [ + "decrypted = torch.empty_like(initial)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8VcF04mf6Rn5" + }, + "source": [ + "Call `torchcsprng.decrypt` to decrypt `encrypted` tensor in ECB mode with 128-bit `key` tensor and store the result to `decrypted` tensor." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kojXCFGK5v6l" + }, + "source": [ + "csprng.decrypt(encrypted, decrypted, key, \"aes128\", \"ecb\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9dEBSPD6EFSu" + }, + "source": [ + "Let's check that `decrypted` equals to `initial`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yOc1ftnM5yyj" + }, + "source": [ + "assert (decrypted == initial).all()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cQWyteLlE4mQ" + }, + "source": [ + "Another example is to use [CTR](https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR)) mode with 128-bit `key` tensor of 4 elements of dtype `dtype=torch.float32`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZFInqYawD7ks" + }, + "source": [ + "key = torch.empty(4, dtype=torch.float32, device=device).random_()\n", + "key" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FRz94NaZGyRS" + }, + "source": [ + "Let's encrypt 100 elements `torch.bool` tensor and store the result in 56 elements `torch.int16` tensor:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8uiqxiehF_is" + }, + "source": [ + "initial = torch.empty(100, dtype=torch.bool, device=device).random_()\n", + "initial" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "G0URlmQYGfcW" + }, + "source": [ + "encrypted = torch.empty(56, dtype=torch.int16, device=device)\n", + "csprng.encrypt(initial, encrypted, key, \"aes128\", \"ctr\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U9Zz2oXoHw9Q" + }, + "source": [ + "Decrypt it back and check that `decrypted` equals to `initial`:" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YXNcdUbXHoPC" + }, + 
"source": [ + "decrypted = torch.empty_like(initial)\n", + "csprng.decrypt(encrypted, decrypted, key, \"aes128\", \"ctr\")\n", + "decrypted" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ie7epw1SKrdQ" + }, + "source": [ + "assert (decrypted == initial).all()" + ], + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/packaging/README.md b/packaging/README.md new file mode 100644 index 0000000..20ff064 --- /dev/null +++ b/packaging/README.md @@ -0,0 +1,90 @@ +# Building torchcsprng packages for release + +## Anaconda packages + +### Linux + +```bash +nvidia-docker run -it --ipc=host --rm -v $(pwd):/remote soumith/conda-cuda bash +pushd remote/conda + +./build_csprng.sh 9.0 +./build_csprng.sh 10.0 +./build_csprng.sh cpu + +# copy packages over to /remote +# exit docker +# anaconda upload -u pytorch torchcsprng*.bz2 +``` + +### OSX + +```bash +# create a fresh anaconda environment / install and activate it +conda install -y conda-build anaconda-client +./build_csprng.sh cpu + +# copy packages over to /remote +# exit docker +# anaconda upload -u pytorch torchcsprng*.bz2 +``` + +### Windows + +```bash +# Open `Git Bash` and change dir to `conda` +./build_csprng.sh 9.0 +./build_csprng.sh 10.0 +./build_csprng.sh cpu + +# copy packages to a output directory +# anaconda upload -u pytorch torchcsprng*.bz2 +``` + +## Wheels + +### Linux + +pushd wheel + +```bash +nvidia-docker run -it --ipc=host --rm -v $(pwd):/remote soumith/manylinux-cuda90:latest bash +cd remote +./linux_manywheel.sh cu90 + +rm -rf /usr/local/cuda* +./linux_manywheel.sh cpu +``` + +```bash +nvidia-docker run -it --ipc=host --rm -v $(pwd):/remote soumith/manylinux-cuda100:latest bash +cd remote +./linux_manywheel.sh cu100 +``` + +wheels are in the folders `cpu`, `cu90`, `cu100`. + +You can upload the `cu90` wheels to twine with `twine upload *.whl`. 
+Which wheels we upload depends on which wheels PyTorch uploads as default, and right now, it's `cu90`. + +### OSX + +```bash +pushd wheel +./osx_wheel.sh +``` + +### Windows + +```cmd +set PYTORCH_REPO=pytorch + +pushd windows +call build_csprng.bat 90 0.3.0 1 +call build_csprng.bat 100 0.3.0 1 +call build_csprng.bat cpu 0.3.0 1 +``` + +wheels are in the current folder. + +You can upload them to twine with `twine upload *.whl` diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh index c628f6e..e0e096d 100755 --- a/packaging/build_conda.sh +++ b/packaging/build_conda.sh @@ -5,10 +5,10 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" . "$script_dir/pkg_helpers.bash" export BUILD_TYPE=conda -setup_env 0.1.0 +setup_env $(cat "version.txt" | sed "s/\([0-9]*\.[0-9]*\.[0-9]*\).*/\1/g") export SOURCE_ROOT_DIR="$PWD" setup_conda_pytorch_constraint setup_conda_cudatoolkit_constraint setup_visual_studio_constraint setup_junit_results_folder -conda build $CONDA_CHANNEL_FLAGS -c defaults -c conda-forge --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torch_csprng +conda build $CONDA_CHANNEL_FLAGS -c defaults -c conda-forge --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchcsprng diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh index 98726cd..15b85a4 100755 --- a/packaging/build_wheel.sh +++ b/packaging/build_wheel.sh @@ -5,14 +5,50 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" . 
"$script_dir/pkg_helpers.bash" export BUILD_TYPE=wheel -setup_env 0.1.0 +setup_env $(cat "version.txt" | sed "s/\([0-9]*\.[0-9]*\.[0-9]*\).*/\1/g") setup_wheel_python pip_install numpy pyyaml future ninja setup_pip_pytorch_version python setup.py clean +# Copy binaries to be included in the wheel distribution +if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then + python_exec="$(which python)" + bin_path=$(dirname $python_exec) + env_path=$(dirname $bin_path) + if [[ "$(uname)" == Darwin ]]; then + # Install delocate to relocate the required binaries + pip_install delocate + fi +else + # Install auditwheel to get some inspection utilities + pip_install auditwheel + + # Point to custom libraries + export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH + export TORCHCSPRNG_INCLUDE=$(pwd)/ext_libraries/include + export TORCHCSPRNG_LIBRARY=$(pwd)/ext_libraries/lib +fi + if [[ "$OSTYPE" == "msys" ]]; then IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel else IS_WHEEL=1 python setup.py bdist_wheel fi + + +if [[ "$(uname)" == Darwin ]]; then + pushd dist/ + python_exec="$(which python)" + bin_path=$(dirname $python_exec) + env_path=$(dirname $bin_path) + for whl in *.whl; do + DYLD_LIBRARY_PATH="$env_path/lib/:$DYLD_LIBRARY_PATH" delocate-wheel -v $whl + done +else + if [[ "$OSTYPE" == "msys" ]]; then + "$script_dir/windows/internal/vc_env_helper.bat" python $script_dir/wheel/relocate.py + else + LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH" python $script_dir/wheel/relocate.py + fi +fi diff --git a/packaging/conda/build_csprng.sh b/packaging/conda/build_csprng.sh new file mode 100755 index 0000000..44fc0af --- /dev/null +++ b/packaging/conda/build_csprng.sh @@ -0,0 +1,229 @@ +#!/usr/bin/env bash +if [[ -x "/remote/anaconda_token" ]]; then + . 
/remote/anaconda_token || true +fi + +set -ex + +if [[ "$CIRCLECI" == 'true' ]]; then + export PATH="/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sbin:/bin:/sbin:.:$PATH" +fi + +# Function to retry functions that sometimes timeout or have flaky failures +retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) +} + +# Parse arguments and determine version +########################################################### +if [[ -n "$DESIRED_CUDA" && -n "$TORCHCSPRNG_BUILD_VERSION" && -n "$TORCHCSPRNG_BUILD_NUMBER" ]]; then + desired_cuda="$DESIRED_CUDA" + build_version="$PYTORCH_BUILD_VERSION" + build_number="$PYTORCH_BUILD_NUMBER" +else + if [ "$#" -ne 3 ]; then + echo "Illegal number of parameters. Pass cuda version, pytorch version, build number" + echo "CUDA version should be Mm with no dot, e.g. '80'" + echo "DESIRED_PYTHON should be M.m, e.g. '2.7'" + exit 1 + fi + + desired_cuda="$1" + build_version="$2" + build_number="$3" +fi +if [[ "$desired_cuda" != cpu ]]; then + desired_cuda="$(echo $desired_cuda | tr -d cuda. 
)" +fi +echo "Building cuda version $desired_cuda and torchcsprng version: $build_version build_number: $build_number" + +if [[ "$desired_cuda" == 'cpu' ]]; then + cpu_only=1 + cuver="cpu" +else + # Switch desired_cuda to be M.m to be consistent with other scripts in + # pytorch/builder + export FORCE_CUDA=1 + cuda_nodot="$desired_cuda" + + if [[ ${#cuda_nodot} -eq 2 ]]; then + desired_cuda="${desired_cuda:0:1}.${desired_cuda:1:1}" + elif [[ ${#cuda_nodot} -eq 3 ]]; then + desired_cuda="${desired_cuda:0:2}.${desired_cuda:2:1}" + else + echo "unknown cuda version $cuda_nodot" + exit 1 + fi + + cuver="cu$cuda_nodot" +fi + +export TORCHCSPRNG_BUILD_VERSION=$build_version +export TORCHCSPRNG_BUILD_NUMBER=$build_number + +if [[ -z "$DESIRED_PYTHON" ]]; then + DESIRED_PYTHON=('3.5' '3.6' '3.7') +fi + +SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" + +if [[ -z "$WIN_PACKAGE_WORK_DIR" ]]; then + WIN_PACKAGE_WORK_DIR="$(echo $(pwd -W) | tr '/' '\\')\\tmp_conda_$(date +%H%M%S)" +fi + +mkdir -p "$WIN_PACKAGE_WORK_DIR" || true +csprng_rootdir="$(realpath ${WIN_PACKAGE_WORK_DIR})/torchcsprng-src" +git config --system core.longpaths true + +if [[ ! 
-d "$csprng_rootdir" ]]; then + rm -rf "$csprng_rootdir" + git clone "https://github.com/pytorch/csprng" "$csprng_rootdir" + pushd "$csprng_rootdir" + git checkout $PYTORCH_BRANCH + popd +fi + +cd "$SOURCE_DIR" + +export tmp_conda="${WIN_PACKAGE_WORK_DIR}\\conda" +export miniconda_exe="${WIN_PACKAGE_WORK_DIR}\\miniconda.exe" +rm -rf "$tmp_conda" +rm -f "$miniconda_exe" +curl -sSk https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "$miniconda_exe" +"$SOURCE_DIR/install_conda.bat" && rm "$miniconda_exe" +pushd $tmp_conda +export PATH="$(pwd):$(pwd)/Library/usr/bin:$(pwd)/Library/bin:$(pwd)/Scripts:$(pwd)/bin:$PATH" +popd +retry conda install -yq conda-build + +ANACONDA_USER=pytorch-nightly +conda config --set anaconda_upload no + + +export TORCHCSPRNG_PACKAGE_SUFFIX="" +if [[ "$desired_cuda" == 'cpu' ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="" + export CONDA_CPUONLY_FEATURE="- cpuonly # [not osx]" + export CUDA_VERSION="None" +else + export CONDA_CPUONLY_FEATURE="" + . 
./switch_cuda_version.sh $desired_cuda + if [[ "$desired_cuda" == "10.2" ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]" + elif [[ "$desired_cuda" == "10.1" ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.1,<10.2 # [not osx]" + elif [[ "$desired_cuda" == "10.0" ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.0,<10.1 # [not osx]" + elif [[ "$desired_cuda" == "9.2" ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.2,<9.3 # [not osx]" + elif [[ "$desired_cuda" == "9.0" ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=9.0,<9.1 # [not osx]" + elif [[ "$desired_cuda" == "8.0" ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=8.0,<8.1 # [not osx]" + else + echo "unhandled desired_cuda: $desired_cuda" + exit 1 + fi +fi + +if [[ -z "$PYTORCH_VERSION" ]]; then + export CONDA_CHANNEL_FLAGS="-c pytorch-nightly -c pytorch" + export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \ + python -c "import os, sys, json, re; cuver = '$cuver'; \ + cuver = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ + print(re.sub(r'\\+.*$', '', \ + [x['version'] for x in json.load(sys.stdin)['pytorch'] \ + if (x['platform'] == 'darwin' or cuver in x['fn']) \ + and 'py' + os.environ['DESIRED_PYTHON'] in x['fn']][-1]))")" + if [[ -z "$PYTORCH_VERSION" ]]; then + echo "PyTorch version auto detection failed" + echo "No package found for desired_cuda=$desired_cuda and DESIRED_PYTHON=$DESIRED_PYTHON" + exit 1 + fi +else + export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-nightly" +fi +if [[ "$desired_cuda" == 'cpu' ]]; then + export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION" + export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION" +else + export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}" + export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}" +fi + +# Loop through all Python versions to 
build a package for each +for py_ver in "${DESIRED_PYTHON[@]}"; do + build_string="py${py_ver}_${build_string_suffix}" + folder_tag="${build_string}_$(date +'%Y%m%d')" + + # Create the conda package into this temporary folder. This is so we can find + # the package afterwards, as there's no easy way to extract the final filename + # from conda-build + output_folder="out_$folder_tag" + rm -rf "$output_folder" + mkdir "$output_folder" + + if [[ "$py_ver" == 3.5 ]]; then + export CONDA_TYPING_CONSTRAINT="- typing" + else + export CONDA_TYPING_CONSTRAINT="" + fi + + export VSTOOLCHAIN_PACKAGE=vs2017 + + # We need to build the compiler activation scripts first on Windows + time VSDEVCMD_ARGS=${VSDEVCMD_ARGS[@]} \ + conda build -c "$ANACONDA_USER" \ + --no-anaconda-upload \ + --output-folder "$output_folder" \ + ../$VSTOOLCHAIN_PACKAGE + + cp ../$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml ../torchcsprng/conda_build_config.yaml + + conda config --set anaconda_upload no + echo "Calling conda-build at $(date)" + if [[ "$desired_cuda" == "9.2" ]]; then + time CMAKE_ARGS=${CMAKE_ARGS[@]} \ + BUILD_VERSION="$TORCHCSPRNG_BUILD_VERSION" \ + CU_VERSION="$cuver" \ + SOURCE_ROOT_DIR="$csprng_rootdir" \ + conda build -c "$ANACONDA_USER" \ + -c defaults \ + -c conda-forge \ + -c "numba/label/dev" \ + --no-anaconda-upload \ + --python "$py_ver" \ + --output-folder "$output_folder" \ + --no-verify \ + --no-test \ + ../torchcsprng + else + time CMAKE_ARGS=${CMAKE_ARGS[@]} \ + BUILD_VERSION="$TORCHCSPRNG_BUILD_VERSION" \ + CU_VERSION="$cuver" \ + SOURCE_ROOT_DIR="$csprng_rootdir" \ + conda build -c "$ANACONDA_USER" \ + -c defaults \ + -c conda-forge \ + --no-anaconda-upload \ + --python "$py_ver" \ + --output-folder "$output_folder" \ + --no-verify \ + --no-test \ + ../torchcsprng + fi + echo "Finished conda-build at $(date)" + + # Extract the package for testing + ls -lah "$output_folder" + built_package="$(find $output_folder/ -name '*torchcsprng*.tar.bz2')" + + # Copy the built 
package to the host machine for persistence before testing + if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then + mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true + cp "$built_package" "$PYTORCH_FINAL_PACKAGE_DIR/" + fi +done + + +set +e diff --git a/packaging/conda/install_conda.bat b/packaging/conda/install_conda.bat new file mode 100644 index 0000000..6052ad0 --- /dev/null +++ b/packaging/conda/install_conda.bat @@ -0,0 +1 @@ +start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% diff --git a/packaging/conda/switch_cuda_version.sh b/packaging/conda/switch_cuda_version.sh new file mode 100755 index 0000000..342def9 --- /dev/null +++ b/packaging/conda/switch_cuda_version.sh @@ -0,0 +1,28 @@ +if [[ "$OSTYPE" == "msys" ]]; then + CUDA_DIR="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v$1" +else + CUDA_DIR="/usr/local/cuda-$1" +fi + +if ! ls "$CUDA_DIR" +then + echo "folder $CUDA_DIR not found to switch" +fi + +echo "Switching symlink to $CUDA_DIR" +mkdir -p /usr/local +rm -fr /usr/local/cuda +ln -s "$CUDA_DIR" /usr/local/cuda + +if [[ "$OSTYPE" == "msys" ]]; then + export CUDA_VERSION=`ls /usr/local/cuda/bin/cudart64*.dll | head -1 | tr '._' ' ' | cut -d ' ' -f2` + export CUDNN_VERSION=`ls /usr/local/cuda/bin/cudnn64*.dll | head -1 | tr '._' ' ' | cut -d ' ' -f2` +else + export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) + export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." 
-f -3 | rev) +fi + +ls -alh /usr/local/cuda + +echo "CUDA_VERSION=$CUDA_VERSION" +echo "CUDNN_VERSION=$CUDNN_VERSION" diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash index 8181bf2..dad9622 100644 --- a/packaging/pkg_helpers.bash +++ b/packaging/pkg_helpers.bash @@ -14,8 +14,8 @@ # PYTORCH_VERSION_SUFFIX (e.g., +cpu) # WHEEL_DIR (e.g., cu100/) # CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension) -# FORCE_CUDA (respected by torch_csprng setup.py) -# NVCC_FLAGS (respected by torch_csprng setup.py) +# FORCE_CUDA (respected by torchcsprng setup.py) +# NVCC_FLAGS (respected by torchcsprng setup.py) # # Precondition: CUDA versions are installed in their conventional locations in # /usr/local/cuda-* @@ -49,6 +49,39 @@ setup_cuda() { # Now work out the CUDA settings case "$CU_VERSION" in + cu112) + if [[ "$OSTYPE" == "msys" ]]; then + export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.2" + else + export CUDA_HOME=/usr/local/cuda-11.2/ + fi + export FORCE_CUDA=1 + # Hard-coding gencode flags is temporary situation until + # https://github.com/pytorch/pytorch/pull/23408 lands + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50" + ;; + cu111) + if [[ "$OSTYPE" == "msys" ]]; then + export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1" + else + export CUDA_HOME=/usr/local/cuda-11.1/ + fi + export FORCE_CUDA=1 + # Hard-coding gencode flags is temporary situation until + # https://github.com/pytorch/pytorch/pull/23408 lands + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 
-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_50,code=compute_50" + ;; + cu110) + if [[ "$OSTYPE" == "msys" ]]; then + export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.0" + else + export CUDA_HOME=/usr/local/cuda-11.0/ + fi + export FORCE_CUDA=1 + # Hard-coding gencode flags is temporary situation until + # https://github.com/pytorch/pytorch/pull/23408 lands + export NVCC_FLAGS="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_50,code=compute_50" + ;; cu102) if [[ "$OSTYPE" == "msys" ]]; then export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2" @@ -121,7 +154,7 @@ setup_build_version() { # Set build version based on tag if on tag if [[ -n "${CIRCLE_TAG}" ]]; then # Strip tag - export BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')" + export BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}" fi } @@ -170,11 +203,7 @@ setup_wheel_python() { conda env remove -n "env$PYTHON_VERSION" || true conda create -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION" conda activate "env$PYTHON_VERSION" - # Install libpng from Anaconda (defaults) - conda install libpng jpeg -y else - # Install native CentOS libPNG - yum install -y libpng-devel libjpeg-turbo-devel case "$PYTHON_VERSION" in 2.7) if [[ -n "$UNICODE_ABI" ]]; then @@ -187,12 +216,19 @@ setup_wheel_python() { 3.6) python_abi=cp36-cp36m ;; 3.7) python_abi=cp37-cp37m ;; 3.8) python_abi=cp38-cp38 ;; + 3.9) python_abi=cp39-cp39 ;; *) echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" exit 1 ;; esac - export PATH="/opt/python/$python_abi/bin:$PATH" + # Download all the dependencies required to compile image and video_reader + # extensions + + mkdir -p ext_libraries + pushd 
ext_libraries + popd + export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH" fi } @@ -217,9 +253,8 @@ setup_pip_pytorch_version() { fi else pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \ - -f https://download.pytorch.org/whl/torch_stable.html \ - -f https://download.pytorch.org/whl/test/torch_test.html \ - -f https://download.pytorch.org/whl/nightly/torch_nightly.html + -f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \ + -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html" fi } @@ -229,7 +264,7 @@ setup_pip_pytorch_version() { # You MUST have populated PYTORCH_VERSION_SUFFIX before hand. setup_conda_pytorch_constraint() { if [[ -z "$PYTORCH_VERSION" ]]; then - export CONDA_CHANNEL_FLAGS="-c pytorch-nightly" + export CONDA_CHANNEL_FLAGS="-c pytorch-nightly -c pytorch" export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \ python -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \ cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ @@ -244,7 +279,7 @@ setup_conda_pytorch_constraint() { exit 1 fi else - export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-nightly -c pytorch-test" + export CONDA_CHANNEL_FLAGS="-c pytorch -c pytorch-${UPLOAD_CHANNEL}" fi if [[ "$CU_VERSION" == cpu ]]; then export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}" @@ -265,6 +300,15 @@ setup_conda_cudatoolkit_constraint() { export CONDA_CUDATOOLKIT_CONSTRAINT="" else case "$CU_VERSION" in + cu112) + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.2,<11.3 # [not osx]" + ;; + cu111) + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.1,<11.2 # [not osx]" + ;; + cu110) + export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.0,<11.1 # [not osx]" + ;; cu102) export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]" ;; @@ -289,12 +333,45 @@ 
setup_conda_cudatoolkit_constraint() { fi } +setup_conda_cudatoolkit_plain_constraint() { + export CONDA_CPUONLY_FEATURE="" + export CMAKE_USE_CUDA=1 + if [[ "$(uname)" == Darwin ]]; then + export CONDA_CUDATOOLKIT_CONSTRAINT="" + export CMAKE_USE_CUDA=0 + else + case "$CU_VERSION" in + cu102) + export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.2" + ;; + cu101) + export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.1" + ;; + cu100) + export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.0" + ;; + cu92) + export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=9.2" + ;; + cpu) + export CONDA_CUDATOOLKIT_CONSTRAINT="" + export CONDA_CPUONLY_FEATURE="cpuonly" + export CMAKE_USE_CUDA=0 + ;; + *) + echo "Unrecognized CU_VERSION=$CU_VERSION" + exit 1 + ;; + esac + fi +} + # Build the proper compiler package before building the final package setup_visual_studio_constraint() { if [[ "$OSTYPE" == "msys" ]]; then export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE - cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torch_csprng/conda_build_config.yaml + cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchcsprng/conda_build_config.yaml fi } diff --git a/packaging/torchcsprng/bld.bat b/packaging/torchcsprng/bld.bat new file mode 100644 index 0000000..8c7c833 --- /dev/null +++ b/packaging/torchcsprng/bld.bat @@ -0,0 +1,27 @@ +@echo on + +set TORCHCSPRNG_BUILD_VERSION=%PKG_VERSION% +set TORCHCSPRNG_BUILD_NUMBER=%PKG_BUILDNUM% + +set build_with_cuda= + +if "%CUDA_VERSION%" == "None" goto cuda_flags_end +if "%CUDA_VERSION%" == "cpu" goto cuda_flags_end +if "%CUDA_VERSION%" == "" goto cuda_flags_end + +set build_with_cuda=1 +set desired_cuda=%CUDA_VERSION:~0,-1%.%CUDA_VERSION:~-1,1% + +set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%desired_cuda% +set CUDA_BIN_PATH=%CUDA_PATH%\bin +set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr +if 
"%desired_cuda%" == "9.0" set NVCC_FLAGS=%NVCC_FLAGS% -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_50,code=compute_50 +if "%desired_cuda%" == "9.2" set NVCC_FLAGS=%NVCC_FLAGS% -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_50,code=compute_50 +if "%desired_cuda%" == "10.0" set NVCC_FLAGS=%NVCC_FLAGS% -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50 +if "%desired_cuda%" == "10.1" set NVCC_FLAGS=%NVCC_FLAGS% -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50 +if "%desired_cuda%" == "10.2" set NVCC_FLAGS=%NVCC_FLAGS% -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50 + +:cuda_flags_end + +python setup.py install --single-version-externally-managed --record=record.txt +if errorlevel 1 exit /b 1 diff --git a/packaging/torchcsprng/conda_build_config.yaml b/packaging/torchcsprng/conda_build_config.yaml new file mode 100644 index 0000000..257515c --- /dev/null +++ b/packaging/torchcsprng/conda_build_config.yaml @@ -0,0 +1,26 @@ +channel_sources: + - pytorch-nightly,pytorch,defaults +blas_impl: + - mkl # [x86_64] +c_compiler: + - vs2017 # [win] +cxx_compiler: + - vs2017 # [win] +python: + - 3.5 + - 3.6 +# This differs from target_platform in that it determines what subdir the compiler +# will target, 
not what subdir the compiler package will be itself. +# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 +# code on win-64 miniconda. +cross_compiler_target_platform: + - win-64 # [win] +target_platform: + - win-64 # [win] +vc: + - 14 +zip_keys: + - # [win] + - vc # [win] + - c_compiler # [win] + - cxx_compiler # [win] diff --git a/packaging/torch_csprng/meta.yaml b/packaging/torchcsprng/meta.yaml similarity index 95% rename from packaging/torch_csprng/meta.yaml rename to packaging/torchcsprng/meta.yaml index 41be147..1b4570d 100644 --- a/packaging/torch_csprng/meta.yaml +++ b/packaging/torchcsprng/meta.yaml @@ -1,5 +1,5 @@ package: - name: torch_csprng + name: torchcsprng version: "{{ environ.get('BUILD_VERSION') }}" source: @@ -39,12 +39,13 @@ build: #test: # imports: # - torch -# - torch_csprng +# - torchcsprng # source_files: # - test # requires: # - pytest # - scipy +# - pycrypto # commands: # pytest . --verbose diff --git a/packaging/vs2017/activate.bat b/packaging/vs2017/activate.bat new file mode 100644 index 0000000..ccecfc2 --- /dev/null +++ b/packaging/vs2017/activate.bat @@ -0,0 +1,44 @@ +:: Set env vars that tell distutils to use the compiler that we put on path +SET DISTUTILS_USE_SDK=1 +SET MSSdk=1 + +SET "VS_VERSION=15.0" +SET "VS_MAJOR=15" +SET "VS_YEAR=2017" + +set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" +set "MSYS2_ENV_CONV_EXCL=CL" + +:: For Python 3.5+, ensure that we link with the dynamic runtime. See +:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info +set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VSINSTALLDIR=%%i\" + goto :vswhere + ) +) + +:vswhere + +:: Shorten PATH to avoid the `input line too long` error. 
+SET MyPath=%PATH% + +setlocal EnableDelayedExpansion + +SET TempPath="%MyPath:;=";"%" +SET var= +FOR %%a IN (%TempPath%) DO ( + IF EXIST %%~sa ( + SET "var=!var!;%%~sa" + ) +) + +set "TempPath=!var:~1!" +endlocal & set "PATH=%TempPath%" + +:: Shorten current directory too +FOR %%A IN (.) DO CD "%%~sA" + +:: other things added by install_activate.bat at package build time diff --git a/packaging/torch_csprng/conda_build_config.yaml b/packaging/vs2017/conda_build_config.yaml similarity index 93% rename from packaging/torch_csprng/conda_build_config.yaml rename to packaging/vs2017/conda_build_config.yaml index dd426d8..5188bb0 100644 --- a/packaging/torch_csprng/conda_build_config.yaml +++ b/packaging/vs2017/conda_build_config.yaml @@ -1,3 +1,5 @@ +blas_impl: + - mkl # [x86_64] c_compiler: - vs2017 # [win] cxx_compiler: diff --git a/packaging/vs2017/install_activate.bat b/packaging/vs2017/install_activate.bat new file mode 100644 index 0000000..de0e6ff --- /dev/null +++ b/packaging/vs2017/install_activate.bat @@ -0,0 +1,30 @@ +set YEAR=2017 +set VER=15 + +mkdir "%PREFIX%\etc\conda\activate.d" +COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + +IF "%cross_compiler_target_platform%" == "win-64" ( + set "target_platform=amd64" + echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + IF "%VSDEVCMD_ARGS%" == "" ( + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + ) ELSE ( + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 
%VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + ) + echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + ) else ( + set "target_platform=x86" + echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo popd + ) + diff --git a/packaging/vs2017/install_runtime.bat b/packaging/vs2017/install_runtime.bat new file mode 100644 index 0000000..5163c16 --- /dev/null +++ b/packaging/vs2017/install_runtime.bat @@ -0,0 +1,49 @@ +set VC_PATH=x86 +if "%ARCH%"=="64" ( + set VC_PATH=x64 +) + +set MSC_VER=2017 + +rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015 +rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( +rem set SP=%%A +rem ) + +rem if not "%SP%" == "%PKG_VERSION%" ( +rem echo "Version detected from registry: %SP%" +rem echo "does not match version of package being built (%PKG_VERSION%)" +rem echo "Do you have current updates for VS 2015 installed?" +rem exit 1 +rem ) + + +REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! 
+robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E +robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E +if %ERRORLEVEL% GEQ 8 exit 1 + +REM ========== This one comes from visual studio 2017 +set "VC_VER=141" + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto :eof + ) +) + +@setlocal +call "%VS15VARSALL%" x64 + +set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" + +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +@endlocal diff --git a/packaging/vs2017/meta.yaml b/packaging/vs2017/meta.yaml new file mode 100644 index 0000000..1f56952 --- /dev/null +++ b/packaging/vs2017/meta.yaml @@ -0,0 +1,24 @@ +{% set vcver="14.1" %} +{% set vcfeature="14" %} +{% set vsyear="2017" %} +{% set fullver="15.4.27004.2010" %} + +package: + name: vs{{ vsyear }} + version: {{ fullver }} + +build: + skip: True [not win] + script_env: + - VSDEVCMD_ARGS # [win] + +outputs: + - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} + script: install_activate.bat + track_features: + # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141". 
+ strong: + - vc{{ vcfeature }} + about: + summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler + license: BSD 3-clause diff --git a/packaging/vs2019/activate.bat b/packaging/vs2019/activate.bat new file mode 100644 index 0000000..6f607ba --- /dev/null +++ b/packaging/vs2019/activate.bat @@ -0,0 +1,44 @@ +:: Set env vars that tell distutils to use the compiler that we put on path +SET DISTUTILS_USE_SDK=1 +SET MSSdk=1 + +SET "VS_VERSION=16.0" +SET "VS_MAJOR=16" +SET "VS_YEAR=2019" + +set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" +set "MSYS2_ENV_CONV_EXCL=CL" + +:: For Python 3.5+, ensure that we link with the dynamic runtime. See +:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info +set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VSINSTALLDIR=%%i\" + goto :vswhere + ) +) + +:vswhere + +:: Shorten PATH to avoid the `input line too long` error. +SET MyPath=%PATH% + +setlocal EnableDelayedExpansion + +SET TempPath="%MyPath:;=";"%" +SET var= +FOR %%a IN (%TempPath%) DO ( + IF EXIST %%~sa ( + SET "var=!var!;%%~sa" + ) +) + +set "TempPath=!var:~1!" +endlocal & set "PATH=%TempPath%" + +:: Shorten current directory too +FOR %%A IN (.) 
DO CD "%%~sA" + +:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2019/conda_build_config.yaml b/packaging/vs2019/conda_build_config.yaml new file mode 100644 index 0000000..358052e --- /dev/null +++ b/packaging/vs2019/conda_build_config.yaml @@ -0,0 +1,24 @@ +blas_impl: + - mkl # [x86_64] +c_compiler: + - vs2019 # [win] +cxx_compiler: + - vs2019 # [win] +python: + - 3.5 + - 3.6 +# This differs from target_platform in that it determines what subdir the compiler +# will target, not what subdir the compiler package will be itself. +# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 +# code on win-64 miniconda. +cross_compiler_target_platform: + - win-64 # [win] +target_platform: + - win-64 # [win] +vc: + - 14 +zip_keys: + - # [win] + - vc # [win] + - c_compiler # [win] + - cxx_compiler # [win] diff --git a/packaging/vs2019/install_activate.bat b/packaging/vs2019/install_activate.bat new file mode 100644 index 0000000..3c38253 --- /dev/null +++ b/packaging/vs2019/install_activate.bat @@ -0,0 +1,30 @@ +set YEAR=2019 +set VER=16 + +mkdir "%PREFIX%\etc\conda\activate.d" +COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + +IF "%cross_compiler_target_platform%" == "win-64" ( + set "target_platform=amd64" + echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + IF "%VSDEVCMD_ARGS%" == "" ( + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + ) ELSE ( + echo 
CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + ) + echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + ) else ( + set "target_platform=x86" + echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" + echo popd + ) + diff --git a/packaging/vs2019/install_runtime.bat b/packaging/vs2019/install_runtime.bat new file mode 100644 index 0000000..e09a5cc --- /dev/null +++ b/packaging/vs2019/install_runtime.bat @@ -0,0 +1,49 @@ +set VC_PATH=x86 +if "%ARCH%"=="64" ( + set VC_PATH=x64 +) + +set MSC_VER=2019 + +rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015 +rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( +rem set SP=%%A +rem ) + +rem if not "%SP%" == "%PKG_VERSION%" ( +rem echo "Version detected from registry: %SP%" +rem echo "does not match version of package being built (%PKG_VERSION%)" +rem echo "Do you have current updates for VS 2015 installed?" +rem exit 1 +rem ) + + +REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! 
+robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E +robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E +if %ERRORLEVEL% GEQ 8 exit 1 + +REM ========== This one comes from visual studio 2019 +set "VC_VER=142" + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto :eof + ) +) + +@setlocal +call "%VS15VARSALL%" x64 + +set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" + +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E +if %ERRORLEVEL% LSS 8 exit 0 +@endlocal diff --git a/packaging/vs2019/meta.yaml b/packaging/vs2019/meta.yaml new file mode 100644 index 0000000..94a0ed4 --- /dev/null +++ b/packaging/vs2019/meta.yaml @@ -0,0 +1,24 @@ +{% set vcver="14.2" %} +{% set vcfeature="14" %} +{% set vsyear="2019" %} +{% set fullver="15.4.27004.2010" %} + +package: + name: vs{{ vsyear }} + version: {{ fullver }} + +build: + skip: True [not win] + script_env: + - VSDEVCMD_ARGS # [win] + +outputs: + - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} + script: install_activate.bat + track_features: + # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142". 
+ strong: + - vc{{ vcfeature }} + about: + summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler + license: BSD 3-clause diff --git a/packaging/wheel/linux_manywheel.sh b/packaging/wheel/linux_manywheel.sh new file mode 100644 index 0000000..d6471aa --- /dev/null +++ b/packaging/wheel/linux_manywheel.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -ex + +if [ "$#" -ne 1 ]; then + echo "Illegal number of parameters. Pass cuda version" + echo "CUDA version should be cu92, cu100 or cpu" + exit 1 +fi +export CUVER="$1" # cu[0-9]* cpu + +if [[ "$CUVER" == "cu102" ]]; then + cu_suffix="" +else + cu_suffix="+$CUVER" +fi + +export TORCHCSPRNG_BUILD_VERSION="0.4.0.dev$(date "+%Y%m%d")${cu_suffix}" +export TORCHCSPRNG_BUILD_NUMBER="1" +export TORCHCSPRNG_LOCAL_VERSION_LABEL="$CUVER" +export OUT_DIR="/remote/$CUVER" + +pushd /opt/python +DESIRED_PYTHON=(*/) +popd +for desired_py in "${DESIRED_PYTHON[@]}"; do + python_installations+=("/opt/python/$desired_py") +done + +OLD_PATH=$PATH +cd /tmp +rm -rf csprng +git clone https://github.com/pytorch/csprng + +cd /tmp/csprng + +for PYDIR in "${python_installations[@]}"; do + export PATH=$PYDIR/bin:$OLD_PATH + pip install --upgrade pip + pip install numpy pyyaml future + + pip uninstall -y torch || true + pip uninstall -y torch_nightly || true + + export TORCHCSPRNG_PYTORCH_DEPENDENCY_NAME=torch_nightly + pip install torch_nightly -f https://download.pytorch.org/whl/nightly/$CUVER/torch_nightly.html + # CPU/CUDA variants of PyTorch have ABI compatible PyTorch for + # the CPU only bits. Therefore, we + # strip off the local package qualifier, but ONLY if we're + # doing a CPU build. 
+ if [[ "$CUVER" == "cpu" ]]; then + export TORCHCSPRNG_PYTORCH_DEPENDENCY_VERSION="$(pip show torch_nightly | grep ^Version: | sed 's/Version: \+//' | sed 's/+.\+//')" + else + export TORCHCSPRNG_PYTORCH_DEPENDENCY_VERSION="$(pip show torch_nightly | grep ^Version: | sed 's/Version: \+//')" + fi + echo "Building against ${TORCHCSPRNG_PYTORCH_DEPENDENCY_VERSION}" + + pip install ninja + python setup.py clean + python setup.py bdist_wheel + mkdir -p $OUT_DIR + cp dist/*.whl $OUT_DIR/ +done diff --git a/packaging/wheel/osx_wheel.sh b/packaging/wheel/osx_wheel.sh new file mode 100644 index 0000000..566f956 --- /dev/null +++ b/packaging/wheel/osx_wheel.sh @@ -0,0 +1,52 @@ +if [[ ":$PATH:" == *"conda"* ]]; then + echo "existing anaconda install in PATH, remove it and run script" + exit 1 +fi +# download and activate anaconda +rm -rf ~/minconda_wheel_env_tmp +wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh && \ + chmod +x Miniconda3-latest-MacOSX-x86_64.sh && \ + ./Miniconda3-latest-MacOSX-x86_64.sh -b -p ~/minconda_wheel_env_tmp && \ + rm Miniconda3-latest-MacOSX-x86_64.sh + +. 
~/minconda_wheel_env_tmp/bin/activate + + +export TORCHCSPRNG_BUILD_VERSION="0.4.0.dev$(date "+%Y%m%d")" +export TORCHCSPRNG_BUILD_NUMBER="1" +export OUT_DIR=~/torchcsprng_wheels + +export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ + +pushd /tmp +rm -rf csprng +git clone https://github.com/pytorch/csprng +pushd csprng + +desired_pythons=( "2.7" "3.5" "3.6" "3.7" ) +# for each python +for desired_python in "${desired_pythons[@]}" +do + # create and activate python env + env_name="env$desired_python" + conda create -yn $env_name python="$desired_python" + conda activate $env_name + + pip uninstall -y torch || true + pip uninstall -y torch_nightly || true + + export TORCHCSPRNG_PYTORCH_DEPENDENCY_NAME=torch_nightly + pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html + export TORCHCSPRNG_PYTORCH_DEPENDENCY_VERSION="$(pip show torch_nightly | grep ^Version: | sed 's/Version: *//')" + echo "Building against ${TORCHCSPRNG_PYTORCH_DEPENDENCY_VERSION}" + + # install torchcsprng dependencies + pip install ninja scipy pytest pycrypto + + python setup.py clean + python setup.py bdist_wheel + mkdir -p $OUT_DIR + cp dist/*.whl $OUT_DIR/ +done +popd +popd diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py new file mode 100644 index 0000000..fd92b68 --- /dev/null +++ b/packaging/wheel/relocate.py @@ -0,0 +1,408 @@ +# -*- coding: utf-8 -*- + +"""Helper script to package wheels and relocate binaries.""" + +import glob +import hashlib +import io + +# Standard library imports +import os +import os.path as osp +import platform +import shutil +import subprocess +import sys +import zipfile +from base64 import urlsafe_b64encode + +# Third party imports +if sys.platform == "linux": + from auditwheel.lddtree import lddtree +from wheel.bdist_wheel import get_abi_tag + + +ALLOWLIST = { + "libgcc_s.so.1", + "libstdc++.so.6", + "libm.so.6", + "libdl.so.2", + "librt.so.1", + "libc.so.6", + "libnsl.so.1", + "libutil.so.1", +
"libpthread.so.0", + "libresolv.so.2", + "libX11.so.6", + "libXext.so.6", + "libXrender.so.1", + "libICE.so.6", + "libSM.so.6", + "libGL.so.1", + "libgobject-2.0.so.0", + "libgthread-2.0.so.0", + "libglib-2.0.so.0", + "ld-linux-x86-64.so.2", + "ld-2.17.so", +} + +WINDOWS_ALLOWLIST = { + "MSVCP140.dll", + "KERNEL32.dll", + "VCRUNTIME140_1.dll", + "VCRUNTIME140.dll", + "api-ms-win-crt-heap-l1-1-0.dll", + "api-ms-win-crt-runtime-l1-1-0.dll", + "api-ms-win-crt-stdio-l1-1-0.dll", + "api-ms-win-crt-filesystem-l1-1-0.dll", + "api-ms-win-crt-string-l1-1-0.dll", + "api-ms-win-crt-environment-l1-1-0.dll", + "api-ms-win-crt-math-l1-1-0.dll", + "api-ms-win-crt-convert-l1-1-0.dll", +} + + +HERE = osp.dirname(osp.abspath(__file__)) +PACKAGE_ROOT = osp.dirname(osp.dirname(HERE)) +PLATFORM_ARCH = platform.machine() +PYTHON_VERSION = sys.version_info + + +def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE): + """Yield pieces of data from a file-like object until EOF.""" + while True: + chunk = file.read(size) + if not chunk: + break + yield chunk + + +def rehash(path, blocksize=1 << 20): + """Return (hash, length) for path using hashlib.sha256()""" + h = hashlib.sha256() + length = 0 + with open(path, "rb") as f: + for block in read_chunks(f, size=blocksize): + length += len(block) + h.update(block) + digest = "sha256=" + urlsafe_b64encode(h.digest()).decode("latin1").rstrip("=") + # unicode/str python2 issues + return (digest, str(length)) # type: ignore + + +def unzip_file(file, dest): + """Decompress zip `file` into directory `dest`.""" + with zipfile.ZipFile(file, "r") as zip_ref: + zip_ref.extractall(dest) + + +def is_program_installed(basename): + """ + Return program absolute path if installed in PATH. + Otherwise, return None + On macOS systems, a .app is considered installed if + it exists. 
+ """ + if sys.platform == "darwin" and basename.endswith(".app") and osp.exists(basename): + return basename + + for path in os.environ["PATH"].split(os.pathsep): + abspath = osp.join(path, basename) + if osp.isfile(abspath): + return abspath + + +def find_program(basename): + """ + Find program in PATH and return absolute path + Try adding .exe or .bat to basename on Windows platforms + (return None if not found) + """ + names = [basename] + if os.name == "nt": + # Windows platforms + extensions = (".exe", ".bat", ".cmd", ".dll") + if not basename.endswith(extensions): + names = [basename + ext for ext in extensions] + [basename] + for name in names: + path = is_program_installed(name) + if path: + return path + + +def patch_new_path(library_path, new_dir): + library = osp.basename(library_path) + name, *rest = library.split(".") + rest = ".".join(rest) + hash_id = hashlib.sha256(library_path.encode("utf-8")).hexdigest()[:8] + new_name = ".".join([name, hash_id, rest]) + return osp.join(new_dir, new_name) + + +def find_dll_dependencies(dumpbin, binary): + out = subprocess.run([dumpbin, "/dependents", binary], stdout=subprocess.PIPE) + out = out.stdout.strip().decode("utf-8") + start_index = out.find("dependencies:") + len("dependencies:") + end_index = out.find("Summary") + dlls = out[start_index:end_index].strip() + dlls = dlls.split(os.linesep) + dlls = [dll.strip() for dll in dlls] + return dlls + + +def relocate_elf_library(patchelf, output_dir, output_library, binary): + """ + Relocate an ELF shared library to be packaged on a wheel. + + Given a shared library, find the transitive closure of its dependencies, + rename and copy them into the wheel while updating their respective rpaths. 
+ """ + + print("Relocating {0}".format(binary)) + binary_path = osp.join(output_library, binary) + + ld_tree = lddtree(binary_path) + tree_libs = ld_tree["libs"] + + binary_queue = [(n, binary) for n in ld_tree["needed"]] + binary_paths = {binary: binary_path} + binary_dependencies = {} + + while binary_queue != []: + library, parent = binary_queue.pop(0) + library_info = tree_libs[library] + print(library) + + if library_info["path"] is None: + print("Omitting {0}".format(library)) + continue + + if library in ALLOWLIST: + # Omit glibc/gcc/system libraries + print("Omitting {0}".format(library)) + continue + + parent_dependencies = binary_dependencies.get(parent, []) + parent_dependencies.append(library) + binary_dependencies[parent] = parent_dependencies + + if library in binary_paths: + continue + + binary_paths[library] = library_info["path"] + binary_queue += [(n, library) for n in library_info["needed"]] + + print("Copying dependencies to wheel directory") + new_libraries_path = osp.join(output_dir, "torchcsprng.libs") + os.makedirs(new_libraries_path) + + new_names = {binary: binary_path} + + for library in binary_paths: + if library != binary: + library_path = binary_paths[library] + new_library_path = patch_new_path(library_path, new_libraries_path) + print("{0} -> {1}".format(library, new_library_path)) + shutil.copyfile(library_path, new_library_path) + new_names[library] = new_library_path + + print("Updating dependency names by new files") + for library in binary_paths: + if library != binary: + if library not in binary_dependencies: + continue + library_dependencies = binary_dependencies[library] + new_library_name = new_names[library] + for dep in library_dependencies: + new_dep = osp.basename(new_names[dep]) + print("{0}: {1} -> {2}".format(library, dep, new_dep)) + subprocess.check_output( + [patchelf, "--replace-needed", dep, new_dep, new_library_name], + cwd=new_libraries_path, + ) + + print("Updating library rpath") + subprocess.check_output( + 
[patchelf, "--set-rpath", "$ORIGIN", new_library_name], + cwd=new_libraries_path, + ) + + subprocess.check_output( + [patchelf, "--print-rpath", new_library_name], cwd=new_libraries_path + ) + + print("Update library dependencies") + library_dependencies = binary_dependencies[binary] + for dep in library_dependencies: + new_dep = osp.basename(new_names[dep]) + print("{0}: {1} -> {2}".format(binary, dep, new_dep)) + subprocess.check_output( + [patchelf, "--replace-needed", dep, new_dep, binary], cwd=output_library + ) + + print("Update library rpath") + subprocess.check_output( + [patchelf, "--set-rpath", "$ORIGIN:$ORIGIN/../torchcsprng.libs", binary_path], + cwd=output_library, + ) + + +def relocate_dll_library(dumpbin, output_dir, output_library, binary): + """ + Relocate a DLL/PE shared library to be packaged on a wheel. + + Given a shared library, find the transitive closure of its dependencies, + rename and copy them into the wheel. + """ + print("Relocating {0}".format(binary)) + binary_path = osp.join(output_library, binary) + + library_dlls = find_dll_dependencies(dumpbin, binary_path) + binary_queue = [(dll, binary) for dll in library_dlls] + binary_paths = {binary: binary_path} + binary_dependencies = {} + + while binary_queue != []: + library, parent = binary_queue.pop(0) + if library in WINDOWS_ALLOWLIST or library.startswith("api-ms-win"): + print("Omitting {0}".format(library)) + continue + + library_path = find_program(library) + if library_path is None: + print("{0} not found".format(library)) + continue + + if osp.basename(osp.dirname(library_path)) == "system32": + continue + + print("{0}: {1}".format(library, library_path)) + parent_dependencies = binary_dependencies.get(parent, []) + parent_dependencies.append(library) + binary_dependencies[parent] = parent_dependencies + + if library in binary_paths: + continue + + binary_paths[library] = library_path + downstream_dlls = find_dll_dependencies(dumpbin, library_path) + binary_queue += [(n, 
library) for n in downstream_dlls] + + print("Copying dependencies to wheel directory") + package_dir = osp.join(output_dir, "torchcsprng") + for library in binary_paths: + if library != binary: + library_path = binary_paths[library] + new_library_path = osp.join(package_dir, library) + print("{0} -> {1}".format(library, new_library_path)) + shutil.copyfile(library_path, new_library_path) + + +def compress_wheel(output_dir, wheel, wheel_dir, wheel_name): + """Create RECORD file and compress wheel distribution.""" + print("Update RECORD file in wheel") + dist_info = glob.glob(osp.join(output_dir, "*.dist-info"))[0] + record_file = osp.join(dist_info, "RECORD") + + with open(record_file, "w") as f: + for root, _, files in os.walk(output_dir): + for this_file in files: + full_file = osp.join(root, this_file) + rel_file = osp.relpath(full_file, output_dir) + if full_file == record_file: + f.write("{0},,\n".format(rel_file)) + else: + digest, size = rehash(full_file) + f.write("{0},{1},{2}\n".format(rel_file, digest, size)) + + print("Compressing wheel") + base_wheel_name = osp.join(wheel_dir, wheel_name) + shutil.make_archive(base_wheel_name, "zip", output_dir) + os.remove(wheel) + shutil.move("{0}.zip".format(base_wheel_name), wheel) + shutil.rmtree(output_dir) + + +def patch_linux(): + # Get patchelf location + patchelf = find_program("patchelf") + if patchelf is None: + raise FileNotFoundError( + "Patchelf was not found in the system, please" + " make sure that is available on the PATH." 
+ ) + + # Find wheel + print("Finding wheels...") + wheels = glob.glob(osp.join(PACKAGE_ROOT, "dist", "*.whl")) + output_dir = osp.join(PACKAGE_ROOT, "dist", ".wheel-process") + + image_binary = "image.so" + video_binary = "video_reader.so" + torchcsprng_binaries = [image_binary, video_binary] + for wheel in wheels: + if osp.exists(output_dir): + shutil.rmtree(output_dir) + + os.makedirs(output_dir) + + print("Unzipping wheel...") + wheel_file = osp.basename(wheel) + wheel_dir = osp.dirname(wheel) + print("{0}".format(wheel_file)) + wheel_name, _ = osp.splitext(wheel_file) + unzip_file(wheel, output_dir) + + print("Finding ELF dependencies...") + output_library = osp.join(output_dir, "torchcsprng") + for binary in torchcsprng_binaries: + if osp.exists(osp.join(output_library, binary)): + relocate_elf_library(patchelf, output_dir, output_library, binary) + + compress_wheel(output_dir, wheel, wheel_dir, wheel_name) + + +def patch_win(): + # Get dumpbin location + dumpbin = find_program("dumpbin") + if dumpbin is None: + raise FileNotFoundError( + "Dumpbin was not found in the system, please" + " make sure that is available on the PATH." 
+ ) + + # Find wheel + print("Finding wheels...") + wheels = glob.glob(osp.join(PACKAGE_ROOT, "dist", "*.whl")) + output_dir = osp.join(PACKAGE_ROOT, "dist", ".wheel-process") + + image_binary = "image.pyd" + video_binary = "video_reader.pyd" + torchcsprng_binaries = [image_binary, video_binary] + for wheel in wheels: + if osp.exists(output_dir): + shutil.rmtree(output_dir) + + os.makedirs(output_dir) + + print("Unzipping wheel...") + wheel_file = osp.basename(wheel) + wheel_dir = osp.dirname(wheel) + print("{0}".format(wheel_file)) + wheel_name, _ = osp.splitext(wheel_file) + unzip_file(wheel, output_dir) + + print("Finding DLL/PE dependencies...") + output_library = osp.join(output_dir, "torchcsprng") + for binary in torchcsprng_binaries: + if osp.exists(osp.join(output_library, binary)): + relocate_dll_library(dumpbin, output_dir, output_library, binary) + + compress_wheel(output_dir, wheel, wheel_dir, wheel_name) + + +if __name__ == "__main__": + if sys.platform == "linux": + patch_linux() + elif sys.platform == "win32": + patch_win() diff --git a/packaging/windows/azure-pipelines-ci.yml b/packaging/windows/azure-pipelines-ci.yml new file mode 100644 index 0000000..6f9f346 --- /dev/null +++ b/packaging/windows/azure-pipelines-ci.yml @@ -0,0 +1,11 @@ + +# Turn off auto builds for commits +trigger: none +pr: none + +jobs: +- template: templates/build_task.yml + parameters: + package: 'Wheels' + spec: 'CPU' + msagent: true diff --git a/packaging/windows/azure-pipelines.yml b/packaging/windows/azure-pipelines.yml new file mode 100644 index 0000000..d024057 --- /dev/null +++ b/packaging/windows/azure-pipelines.yml @@ -0,0 +1,35 @@ + +# Turn off auto builds for commits +trigger: none +pr: none + +jobs: +- template: templates/auth_task.yml + +- template: templates/build_task.yml + parameters: + package: 'Wheels' + spec: 'CPU' + msagent: true + +- template: templates/build_task.yml + parameters: + package: 'Conda' + spec: 'CPU' + msagent: true + +- template: 
templates/build_task.yml + parameters: + package: 'Wheels' + spec: 'CUDA' + msagent: true + +- template: templates/build_task.yml + parameters: + package: 'Conda' + spec: 'CUDA' + msagent: true + +- template: templates/linux_build_task.yml + parameters: + msagent: $(ms.hosted.agent.cpu) diff --git a/packaging/windows/build_csprng.bat b/packaging/windows/build_csprng.bat new file mode 100644 index 0000000..e6da23d --- /dev/null +++ b/packaging/windows/build_csprng.bat @@ -0,0 +1,145 @@ +@echo off + +:: This script parses args, installs required libraries (miniconda, MKL, +:: Magma), and then delegates to cpu.bat, cuda80.bat, etc. + +IF NOT "%CUDA_VERSION%" == "" IF NOT "%TORCHCSPRNG_BUILD_VERSION%" == "" if NOT "%TORCHCSPRNG_BUILD_NUMBER%" == "" goto env_end +if "%~1"=="" goto arg_error +if "%~2"=="" goto arg_error +if "%~3"=="" goto arg_error +if NOT "%~4"=="" goto arg_error +goto arg_end + +:arg_error + +echo Illegal number of parameters. Pass cuda version, pytorch version, build number +echo CUDA version should be Mm with no dot, e.g. '80' +echo DESIRED_PYTHON should be M.m, e.g. 
'2.7' +exit /b 1 + +:arg_end + +set CUDA_VERSION=%~1 +set TORCHCSPRNG_BUILD_VERSION=%~2 +set TORCHCSPRNG_BUILD_NUMBER=%~3 + +set BUILD_VERSION=%TORCHCSPRNG_BUILD_VERSION% + +:env_end + +if NOT "%CUDA_VERSION%" == "cpu" ( + set CUDA_PREFIX=cuda%CUDA_VERSION% + set CUVER=cu%CUDA_VERSION% + set FORCE_CUDA=1 +) else ( + set CUDA_PREFIX=cpu + set CUVER=cpu +) + +set BUILD_CSPRNG=1 +REM set TORCH_WHEEL=torch -f https://download.pytorch.org/whl/%CUVER%/stable.html --no-index + +IF "%DESIRED_PYTHON%" == "" set DESIRED_PYTHON=3.5;3.6;3.7 +set DESIRED_PYTHON_PREFIX=%DESIRED_PYTHON:.=% +set DESIRED_PYTHON_PREFIX=py%DESIRED_PYTHON_PREFIX:;=;py% + +set SRC_DIR=%~dp0 +pushd %SRC_DIR% + +:: Install Miniconda3 +set "CONDA_HOME=%CD%\conda" +set "tmp_conda=%CONDA_HOME%" +set "miniconda_exe=%CD%\miniconda.exe" +rmdir /s /q conda +del miniconda.exe +curl -k https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "%miniconda_exe%" +call ..\conda\install_conda.bat +IF ERRORLEVEL 1 exit /b 1 +set "ORIG_PATH=%PATH%" +set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" + +:: Create a new conda environment +setlocal EnableDelayedExpansion +FOR %%v IN (%DESIRED_PYTHON%) DO ( + set PYTHON_VERSION_STR=%%v + set PYTHON_VERSION_STR=!PYTHON_VERSION_STR:.=! + conda remove -n py!PYTHON_VERSION_STR! --all -y || rmdir %CONDA_HOME%\envs\py!PYTHON_VERSION_STR! /s + conda create -n py!PYTHON_VERSION_STR! -y -q -c defaults -c conda-forge numpy>=1.11 mkl>=2018 python=%%v ca-certificates scipy pycrypto +) + +:: Uncomment for stable releases +:: FOR %%v IN (%DESIRED_PYTHON%) DO ( +:: set PYTHON_VERSION_STR=%%v +:: set PYTHON_VERSION_STR=!PYTHON_VERSION_STR:.=! 
+:: set "PATH=%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!;%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!\scripts;%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!\Library\bin;%ORIG_PATH%" + +:: if "%CUDA_VERSION%" == "100" ( +:: set TORCH_WHEEL=https://download.pytorch.org/whl/%CUVER%/torch-1.2.0-cp!PYTHON_VERSION_STR!-cp!PYTHON_VERSION_STR!m-win_amd64.whl +:: ) else ( +:: set TORCH_WHEEL=https://download.pytorch.org/whl/%CUVER%/torch-1.2.0%%2B%CUVER%-cp!PYTHON_VERSION_STR!-cp!PYTHON_VERSION_STR!m-win_amd64.whl +:: ) +:: echo Installing !TORCH_WHEEL!... +:: pip install "!TORCH_WHEEL!" +:: ) + +:: Uncomment for nightly releases +FOR %%v IN (%DESIRED_PYTHON%) DO ( + set PYTHON_VERSION_STR=%%v + set PYTHON_VERSION_STR=!PYTHON_VERSION_STR:.=! + set "PATH=%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!;%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!\scripts;%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!\Library\bin;%ORIG_PATH%" + + set TORCH_WHEEL=torch --pre -f https://download.pytorch.org/whl/nightly/%CUVER%/torch_nightly.html + echo Installing !TORCH_WHEEL!... + pip install !TORCH_WHEEL! 
+) + +endlocal + +if "%DEBUG%" == "1" ( + set BUILD_TYPE=debug +) ELSE ( + set BUILD_TYPE=release +) + +:: Install sccache +if "%USE_SCCACHE%" == "1" ( + mkdir %CD%\tmp_bin + curl -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %CD%\tmp_bin\sccache.exe + if not "%CUDA_VERSION%" == "" ( + copy %CD%\tmp_bin\sccache.exe %CD%\tmp_bin\nvcc.exe + + set CUDA_NVCC_EXECUTABLE=%CD%\tmp_bin\nvcc + set "PATH=%CD%\tmp_bin;%PATH%" + ) +) + +for %%v in (%DESIRED_PYTHON_PREFIX%) do ( + :: Activate Python Environment + set PYTHON_PREFIX=%%v + set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%" + if defined INCLUDE ( + set "INCLUDE=%INCLUDE%;%CONDA_HOME%\envs\%%v\Library\include" + ) else ( + set "INCLUDE=%CONDA_HOME%\envs\%%v\Library\include" + ) + if defined LIB ( + set "LIB=%LIB%;%CONDA_HOME%\envs\%%v\Library\lib" + ) else ( + set "LIB=%CONDA_HOME%\envs\%%v\Library\lib" + ) + @setlocal + :: Set Flags + if NOT "%CUDA_VERSION%"=="cpu" ( + set CUDNN_VERSION=7 + ) + call %CUDA_PREFIX%.bat + IF ERRORLEVEL 1 exit /b 1 + call internal\test.bat + IF ERRORLEVEL 1 exit /b 1 + @endlocal +) + +set "PATH=%ORIG_PATH%" +popd + +IF ERRORLEVEL 1 exit /b 1 diff --git a/packaging/windows/cpu.bat b/packaging/windows/cpu.bat new file mode 100644 index 0000000..1897fb5 --- /dev/null +++ b/packaging/windows/cpu.bat @@ -0,0 +1,37 @@ +@echo off + +IF NOT "%BUILD_CSPRNG%" == "" ( + set MODULE_NAME=csprng +) ELSE ( + set MODULE_NAME=pytorch +) + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd .. 
+ IF ERRORLEVEL 1 goto eof +) ELSE ( + call internal\clean.bat +) + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto eof + +REM Check for optional components + +echo Disabling CUDA +set NO_CUDA=1 +set USE_CUDA=0 + +IF "%BUILD_CSPRNG%" == "" ( + call internal\check_opts.bat + IF ERRORLEVEL 1 goto eof + + call internal\copy_cpu.bat + IF ERRORLEVEL 1 goto eof +) + +call internal\setup.bat +IF ERRORLEVEL 1 goto eof + +:eof diff --git a/packaging/windows/cuda101.bat b/packaging/windows/cuda101.bat new file mode 100644 index 0000000..016baec --- /dev/null +++ b/packaging/windows/cuda101.bat @@ -0,0 +1,59 @@ +@echo off + +IF NOT "%BUILD_CSPRNG%" == "" ( + set MODULE_NAME=csprng +) ELSE ( + set MODULE_NAME=pytorch +) + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd .. + IF ERRORLEVEL 1 goto eof +) ELSE ( + call internal\clean.bat +) + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto eof + +REM Check for optional components + +set NO_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + goto optcheck +) + +IF "%CUDA_PATH_V10_1%"=="" ( + echo CUDA 10.1 not found, failing + exit /b 1 +) ELSE ( + IF "%BUILD_CSPRNG%" == "" ( + set TORCH_CUDA_ARCH_LIST=3.5;5.0+PTX;6.0;6.1;7.0;7.5 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all + ) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50 + ) + + set "CUDA_PATH=%CUDA_PATH_V10_1%" + set "PATH=%CUDA_PATH_V10_1%\bin;%PATH%" +) + +:optcheck + +IF "%BUILD_CSPRNG%" == "" ( + call internal\check_opts.bat + IF ERRORLEVEL 1 goto eof + + call internal\copy.bat + IF ERRORLEVEL 1 goto eof +) + +call internal\setup.bat +IF ERRORLEVEL 1 goto eof 
+ +:eof diff --git a/packaging/windows/cuda102.bat b/packaging/windows/cuda102.bat new file mode 100644 index 0000000..d5a0bdf --- /dev/null +++ b/packaging/windows/cuda102.bat @@ -0,0 +1,59 @@ +@echo off + +IF NOT "%BUILD_CSPRNG%" == "" ( + set MODULE_NAME=csprng +) ELSE ( + set MODULE_NAME=pytorch +) + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd .. + IF ERRORLEVEL 1 goto eof +) ELSE ( + call internal\clean.bat +) + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto eof + +REM Check for optional components + +set NO_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + goto optcheck +) + +IF "%CUDA_PATH_V10_2%"=="" ( + echo CUDA 10.2 not found, failing + exit /b 1 +) ELSE ( + IF "%BUILD_CSPRNG%" == "" ( + set TORCH_CUDA_ARCH_LIST=3.5;5.0+PTX;6.0;6.1;7.0;7.5 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all + ) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50 + ) + + set "CUDA_PATH=%CUDA_PATH_V10_2%" + set "PATH=%CUDA_PATH_V10_2%\bin;%PATH%" +) + +:optcheck + +IF "%BUILD_CSPRNG%" == "" ( + call internal\check_opts.bat + IF ERRORLEVEL 1 goto eof + + call internal\copy.bat + IF ERRORLEVEL 1 goto eof +) + +call internal\setup.bat +IF ERRORLEVEL 1 goto eof + +:eof diff --git a/packaging/windows/cuda92.bat b/packaging/windows/cuda92.bat new file mode 100644 index 0000000..7f520da --- /dev/null +++ b/packaging/windows/cuda92.bat @@ -0,0 +1,59 @@ +@echo off + +IF NOT "%BUILD_CSPRNG%" == "" ( + set MODULE_NAME=csprng +) ELSE ( + set MODULE_NAME=pytorch +) + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd .. 
+ IF ERRORLEVEL 1 goto eof +) ELSE ( + call internal\clean.bat +) + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto eof + +REM Check for optional components + +set USE_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + goto optcheck +) + +IF "%CUDA_PATH_V9_2%"=="" ( + echo CUDA 9.2 not found, failing + exit /b 1 +) ELSE ( + IF "%BUILD_CSPRNG%" == "" ( + set TORCH_CUDA_ARCH_LIST=3.5;5.0+PTX;6.0;6.1;7.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all + ) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_50,code=compute_50 + ) + + set "CUDA_PATH=%CUDA_PATH_V9_2%" + set "PATH=%CUDA_PATH_V9_2%\bin;%PATH%" +) + +:optcheck + +IF "%BUILD_CSPRNG%" == "" ( + call internal\check_opts.bat + IF ERRORLEVEL 1 goto eof + + call internal\copy.bat + IF ERRORLEVEL 1 goto eof +) + +call internal\setup.bat +IF ERRORLEVEL 1 goto eof + +:eof diff --git a/packaging/windows/internal/auth.bat b/packaging/windows/internal/auth.bat new file mode 100644 index 0000000..c874bce --- /dev/null +++ b/packaging/windows/internal/auth.bat @@ -0,0 +1,46 @@ +@echo off + +: From the following doc, the build won't be triggered if the users don't sign in daily. +: https://docs.microsoft.com/en-us/azure/devops/pipelines/build/triggers?tabs=yaml&view=vsts#my-build-didnt-run-what-happened +: To avoid this problem, we can just go through the sign in process using the following command. 
+ +:auth_start + +if "%RETRY_TIMES%" == "" ( + set /a RETRY_TIMES=10 + set /a SLEEP_TIME=2 +) else ( + set /a RETRY_TIMES=%RETRY_TIMES%-1 + set /a SLEEP_TIME=%SLEEP_TIME%*2 +) + +for /f "usebackq tokens=*" %%i in (`curl -so NUL -w "%%{http_code}" -u %VSTS_AUTH% https://dev.azure.com/pytorch`) do ( + set STATUS_CODE=%%i +) + +IF NOT "%STATUS_CODE%" == "200" ( + echo Auth retry times remaining: %RETRY_TIMES% + echo Sleep time: %SLEEP_TIME% seconds + IF %RETRY_TIMES% EQU 0 ( + echo Auth failed + goto err + ) + waitfor SomethingThatIsNeverHappening /t %SLEEP_TIME% 2>nul || ver >nul + goto auth_start +) ELSE ( + echo Login Attempt Succeeded + goto auth_end +) + +:err + +: Throw a warning if it fails +powershell -c "Write-Warning 'Login Attempt Failed'" + +:auth_end + +set RETRY_TIMES= +set SLEEP_TIME= +set STATUS_CODE= + +exit /b 0 diff --git a/packaging/windows/internal/build_conda.bat b/packaging/windows/internal/build_conda.bat new file mode 100644 index 0000000..6ffd67b --- /dev/null +++ b/packaging/windows/internal/build_conda.bat @@ -0,0 +1,15 @@ +if "%VC_YEAR%" == "2017" set VSDEVCMD_ARGS=-vcvars_ver=14.13 +if "%VC_YEAR%" == "2017" powershell packaging/windows/internal/vs2017_install.ps1 +if errorlevel 1 exit /b 1 + +call packaging/windows/internal/cuda_install.bat +if errorlevel 1 exit /b 1 + +call packaging/windows/internal/nightly_defaults.bat Conda +if errorlevel 1 exit /b 1 + +set PYTORCH_FINAL_PACKAGE_DIR=%CD%\packaging\windows\output +if not exist "%PYTORCH_FINAL_PACKAGE_DIR%" mkdir %PYTORCH_FINAL_PACKAGE_DIR% + +bash ./packaging/conda/build_csprng.sh %CUDA_VERSION% %TORCHCSPRNG_BUILD_VERSION% %TORCHCSPRNG_BUILD_NUMBER% +if errorlevel 1 exit /b 1 diff --git a/packaging/windows/internal/build_wheels.bat b/packaging/windows/internal/build_wheels.bat new file mode 100644 index 0000000..876b8b0 --- /dev/null +++ b/packaging/windows/internal/build_wheels.bat @@ -0,0 +1,12 @@ +if "%VC_YEAR%" == "2017" set VSDEVCMD_ARGS=-vcvars_ver=14.13 +if "%VC_YEAR%" == "2017" 
powershell packaging/windows/internal/vs2017_install.ps1 +if errorlevel 1 exit /b 1 + +call packaging/windows/internal/cuda_install.bat +if errorlevel 1 exit /b 1 + +call packaging/windows/internal/nightly_defaults.bat Wheels +if errorlevel 1 exit /b 1 + +call packaging/windows/build_csprng.bat %CUDA_VERSION% %TORCHCSPRNG_BUILD_VERSION% %TORCHCSPRNG_BUILD_NUMBER% +if errorlevel 1 exit /b 1 diff --git a/packaging/windows/internal/check_deps.bat b/packaging/windows/internal/check_deps.bat new file mode 100644 index 0000000..739e568 --- /dev/null +++ b/packaging/windows/internal/check_deps.bat @@ -0,0 +1,67 @@ +@echo off + +REM Check for necessary components + +IF NOT "%PROCESSOR_ARCHITECTURE%"=="AMD64" ( + echo You should use 64 bits Windows to build and run PyTorch + exit /b 1 +) + +IF "%BUILD_CSPRNG%" == "" ( + where /q cmake.exe + + IF ERRORLEVEL 1 ( + echo CMake is required to compile PyTorch on Windows + exit /b 1 + ) +) + +IF NOT EXIST "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" ( + echo Visual Studio 2017 C++ BuildTools is required to compile PyTorch on Windows + exit /b 1 +) + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15INSTALLDIR=%%i" + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto vswhere + ) +) + +:vswhere +IF "%VS15VCVARSALL%"=="" ( + echo Visual Studio 2017 C++ BuildTools is required to compile PyTorch on Windows + exit /b 1 +) + +set MSSdk=1 +set DISTUTILS_USE_SDK=1 + +where /q python.exe + +IF ERRORLEVEL 1 ( + echo Python x64 3.5 or up is required to compile PyTorch on Windows + exit /b 1 +) + +for /F "usebackq delims=" %%i in (`python -c "import sys; print('{0[0]}{0[1]}'.format(sys.version_info))"`) do ( + set /a PYVER=%%i +) + +if %PYVER% LSS 35 ( + echo Warning: PyTorch for Python 2 under Windows is 
experimental. + echo Python x64 3.5 or up is recommended to compile PyTorch on Windows + echo Maybe you can create a virual environment if you have conda installed: + echo ^> conda create -n test python=3.6 pyyaml mkl numpy + echo ^> activate test +) + +for /F "usebackq delims=" %%i in (`python -c "import struct;print( 8 * struct.calcsize('P'))"`) do ( + set /a PYSIZE=%%i +) + +if %PYSIZE% NEQ 64 ( + echo Python x64 3.5 or up is required to compile PyTorch on Windows + exit /b 1 +) diff --git a/packaging/windows/internal/check_opts.bat b/packaging/windows/internal/check_opts.bat new file mode 100644 index 0000000..003ad92 --- /dev/null +++ b/packaging/windows/internal/check_opts.bat @@ -0,0 +1,33 @@ +@echo off + +REM Check for optional components + +where /q ninja.exe + +IF NOT ERRORLEVEL 1 ( + echo Ninja found, using it to speed up builds + set CMAKE_GENERATOR=Ninja +) + +where /q clcache.exe + +IF NOT ERRORLEVEL 1 ( + echo clcache found, using it to speed up builds + set CC=clcache + set CXX=clcache +) + +where /q sccache.exe + +IF NOT ERRORLEVEL 1 ( + echo sccache found, using it to speed up builds + set CC=sccache cl + set CXX=sccache cl +) + +IF exist "%MKLProductDir%\mkl\lib\intel64_win" ( + echo MKL found, adding it to build + set "LIB=%MKLProductDir%\mkl\lib\intel64_win;%MKLProductDir%\compiler\lib\intel64_win;%LIB%"; +) + +exit /b 0 diff --git a/packaging/windows/internal/clean.bat b/packaging/windows/internal/clean.bat new file mode 100644 index 0000000..7489640 --- /dev/null +++ b/packaging/windows/internal/clean.bat @@ -0,0 +1,5 @@ +@echo off + +cd %MODULE_NAME% +python setup.py clean +cd .. diff --git a/packaging/windows/internal/clone.bat b/packaging/windows/internal/clone.bat new file mode 100644 index 0000000..758527c --- /dev/null +++ b/packaging/windows/internal/clone.bat @@ -0,0 +1,56 @@ +@echo off + +:: The conda and wheels jobs are seperated on Windows, so we don't need to clone again. 
+IF "%BUILD_CSPRNG%" == "" ( + if exist "%NIGHTLIES_PYTORCH_ROOT%" ( + xcopy /E /Y /Q "%NIGHTLIES_PYTORCH_ROOT%" pytorch\ + cd pytorch + goto submodule + ) +) + +git clone https://github.com/%PYTORCH_REPO%/%MODULE_NAME% + +cd %MODULE_NAME% + +IF NOT "%BUILD_CSPRNG%" == "" goto latest_end + +IF "%PYTORCH_BRANCH%" == "latest" ( goto latest_start ) else ( goto latest_end ) + +:latest_start + +if "%NIGHTLIES_DATE%" == "" ( goto date_start ) else ( goto date_end ) + +:date_start + +set "DATE_CMD=Get-Date ([System.TimeZoneInfo]::ConvertTimeFromUtc((Get-Date).ToUniversalTime(), [System.TimeZoneInfo]::FindSystemTimeZoneById('Pacific Standard Time'))) -f 'yyyy_MM_dd'" +set "DATE_COMPACT_CMD=Get-Date ([System.TimeZoneInfo]::ConvertTimeFromUtc((Get-Date).ToUniversalTime(), [System.TimeZoneInfo]::FindSystemTimeZoneById('Pacific Standard Time'))) -f 'yyyyMMdd'" + +FOR /F "delims=" %%i IN ('powershell -c "%DATE_CMD%"') DO set NIGHTLIES_DATE=%%i +FOR /F "delims=" %%i IN ('powershell -c "%DATE_COMPACT_CMD%"') DO set NIGHTLIES_DATE_COMPACT=%%i + +:date_end + +if "%NIGHTLIES_DATE_COMPACT%" == "" set NIGHTLIES_DATE_COMPACT=%NIGHTLIES_DATE:~0,4%%NIGHTLIES_DATE:~5,2%%NIGHTLIES_DATE:~8,2% + +:: Switch to the latest commit by 11:59 yesterday +echo PYTORCH_BRANCH is set to latest so I will find the last commit +echo before 0:00 midnight on %NIGHTLIES_DATE% +set git_date=%NIGHTLIES_DATE:_=-% +FOR /F "delims=" %%i IN ('git log --before %git_date% -n 1 "--pretty=%%H"') DO set last_commit=%%i +echo Setting PYTORCH_BRANCH to %last_commit% since that was the last +echo commit before %NIGHTLIES_DATE% +set PYTORCH_BRANCH=%last_commit% + +:latest_end + +IF "%PYTORCH_BRANCH%" == "" ( + set PYTORCH_BRANCH=v%TORCHCSPRNG_BUILD_VERSION% +) +git checkout %PYTORCH_BRANCH% +IF ERRORLEVEL 1 git checkout tags/%PYTORCH_BRANCH% + +:submodule + +git submodule update --init --recursive +IF ERRORLEVEL 1 exit /b 1 diff --git a/packaging/windows/internal/copy.bat b/packaging/windows/internal/copy.bat new file mode 
100644 index 0000000..b4aa397 --- /dev/null +++ b/packaging/windows/internal/copy.bat @@ -0,0 +1,13 @@ +copy "%CUDA_PATH%\bin\cusparse64_%CUDA_VERSION%.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cublas64_%CUDA_VERSION%.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cudart64_%CUDA_VERSION%.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\curand64_%CUDA_VERSION%.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cufft64_%CUDA_VERSION%.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\cufftw64_%CUDA_VERSION%.dll*" pytorch\torch\lib + +copy "%CUDA_PATH%\bin\cudnn64_%CUDNN_VERSION%.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\nvrtc64_%CUDA_VERSION%*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\bin\nvrtc-builtins64_%CUDA_VERSION%.dll*" pytorch\torch\lib + +copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib +copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib diff --git a/packaging/windows/internal/copy_cpu.bat b/packaging/windows/internal/copy_cpu.bat new file mode 100644 index 0000000..f5b9d11 --- /dev/null +++ b/packaging/windows/internal/copy_cpu.bat @@ -0,0 +1 @@ +copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib diff --git a/packaging/windows/internal/cuda_install.bat b/packaging/windows/internal/cuda_install.bat new file mode 100644 index 0000000..9ca08e1 --- /dev/null +++ b/packaging/windows/internal/cuda_install.bat @@ -0,0 +1,201 @@ +@echo on + +if "%CU_VERSION%" == "cpu" ( + echo Skipping for CPU builds + exit /b 0 +) + +set SRC_DIR=%~dp0\.. 
+ +if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" + +set /a CUDA_VER=%CU_VERSION:cu=% +set CUDA_VER_MAJOR=%CUDA_VER:~0,-1% +set CUDA_VER_MINOR=%CUDA_VER:~-1,1% +set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% + +if %CUDA_VER% EQU 92 goto cuda92 +if %CUDA_VER% EQU 100 goto cuda100 +if %CUDA_VER% EQU 101 goto cuda101 +if %CUDA_VER% EQU 102 goto cuda102 +if %CUDA_VER% EQU 110 goto cuda110 +if %CUDA_VER% EQU 111 goto cuda111 +if %CUDA_VER% EQU 112 goto cuda112 + +echo CUDA %CUDA_VERSION_STR% is not supported +exit /b 1 + +:cuda92 +if not exist "%SRC_DIR%\temp_build\cuda_9.2.148_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cuda_9.2.148_win10.exe --output "%SRC_DIR%\temp_build\cuda_9.2.148_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_9.2.148_win10.exe" + set "ARGS=nvcc_9.2 cuobjdump_9.2 nvprune_9.2 cupti_9.2 cublas_9.2 cublas_dev_9.2 cudart_9.2 cufft_9.2 cufft_dev_9.2 curand_9.2 curand_dev_9.2 cusolver_9.2 cusolver_dev_9.2 cusparse_9.2 cusparse_dev_9.2 nvgraph_9.2 nvgraph_dev_9.2 npp_9.2 npp_dev_9.2 nvrtc_9.2 nvrtc_dev_9.2 nvml_dev_9.2" +) + +if not exist "%SRC_DIR%\temp_build\cudnn-9.2-windows10-x64-v7.2.1.38.zip" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cudnn-9.2-windows10-x64-v7.2.1.38.zip --output "%SRC_DIR%\temp_build\cudnn-9.2-windows10-x64-v7.2.1.38.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-9.2-windows10-x64-v7.2.1.38.zip" +) + +goto cuda_common + +:cuda100 + +if not exist "%SRC_DIR%\temp_build\cuda_10.0.130_411.31_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cuda_10.0.130_411.31_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.0.130_411.31_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.0.130_411.31_win10.exe" + set "ARGS=nvcc_10.0 cuobjdump_10.0 nvprune_10.0 cupti_10.0 cublas_10.0 cublas_dev_10.0 cudart_10.0 cufft_10.0 cufft_dev_10.0 
curand_10.0 curand_dev_10.0 cusolver_10.0 cusolver_dev_10.0 cusparse_10.0 cusparse_dev_10.0 nvgraph_10.0 nvgraph_dev_10.0 npp_10.0 npp_dev_10.0 nvrtc_10.0 nvrtc_dev_10.0 nvml_dev_10.0" +) + +if not exist "%SRC_DIR%\temp_build\cudnn-10.0-windows10-x64-v7.4.1.5.zip" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/win2016/cudnn-10.0-windows10-x64-v7.4.1.5.zip --output "%SRC_DIR%\temp_build\cudnn-10.0-windows10-x64-v7.4.1.5.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.0-windows10-x64-v7.4.1.5.zip" +) + +goto cuda_common + +:cuda101 + +if not exist "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.1.243_426.00_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe" + set "ARGS=nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1" +) + +if not exist "%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-10.1-windows10-x64-v7.6.4.38.zip --output "%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip" +) + +goto cuda_common + +:cuda102 + +if not exist "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.2.89_441.22_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" + set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 
cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2" +) + +if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-10.2-windows10-x64-v7.6.5.32.zip --output "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" +) + +goto cuda_common + +:cuda110 + +if not exist "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.0.2_451.48_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe" + set "ARGS=nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0" +) + +if not exist "%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-11.0-windows-x64-v8.0.4.30.zip --output "%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip" +) + +goto cuda_common + +:cuda111 + +if not exist "%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.1.0_456.43_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe" + set "ARGS=nvcc_11.1 cuobjdump_11.1 nvprune_11.1 
nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1" +) + +@REM There is no downloadable driver for Tesla on CUDA 11.1 yet. We will use +@REM the driver inside CUDA +if "%JOB_EXECUTOR%" == "windows-with-nvidia-gpu" set "ARGS=%ARGS% Display.Driver" + +if not exist "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-11.1-windows-x64-v8.0.5.39.zip --output "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip" +) + +goto cuda_common + +:cuda112 + +if not exist "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe" ( + curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.2.0_460.89_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe" + if errorlevel 1 exit /b 1 + set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe" + set "ARGS=nvcc_11.2 cuobjdump_11.2 nvprune_11.2 nvprof_11.2 cupti_11.2 cublas_11.2 cublas_dev_11.2 cudart_11.2 cufft_11.2 cufft_dev_11.2 curand_11.2 curand_dev_11.2 cusolver_11.2 cusolver_dev_11.2 cusparse_11.2 cusparse_dev_11.2 npp_11.2 npp_dev_11.2 nvrtc_11.2 nvrtc_dev_11.2 nvml_dev_11.2" +) + +if not exist "%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip" ( + curl -k -L http://s3.amazonaws.com/ossci-windows/cudnn-11.2-windows-x64-v8.1.0.77.zip --output "%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip" + if errorlevel 1 exit /b 1 + set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip" +) + +goto cuda_common + +:cuda_common + +if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( + curl -k -L https://www.dropbox.com/s/9mcolalfdj4n979/NvToolsExt.7z?dl=1 --output "%SRC_DIR%\temp_build\NvToolsExt.7z" + if 
errorlevel 1 exit /b 1 +) + +if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" ( + curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" + if errorlevel 1 exit /b 1 +) + +echo Installing CUDA toolkit... +7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda" +pushd "%SRC_DIR%\temp_build\cuda" +start /wait setup.exe -s %ARGS% +popd + +echo Installing VS integration... +xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\Common7\IDE\VC\VCTargets\BuildCustomizations" + +echo Installing NvToolsExt... +7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" +mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" +mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" +mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" +xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" +xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" +xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" + +echo Setting up environment... +set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" +set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" +set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" + +if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( + echo CUDA %CUDA_VERSION_STR% installation failed. 
+ exit /b 1 +) + +echo Installing cuDNN... +7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" +xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" +xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\lib\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" +xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" + +echo Installing GPU driver DLLs +7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -o"C:\Windows\System32" + +echo Cleaning temp files +rd /s /q "%SRC_DIR%\temp_build" || ver > nul diff --git a/packaging/windows/internal/dep_install.bat b/packaging/windows/internal/dep_install.bat new file mode 100644 index 0000000..db665a9 --- /dev/null +++ b/packaging/windows/internal/dep_install.bat @@ -0,0 +1,14 @@ +@echo off + +REM curl -k https://www.7-zip.org/a/7z1805-x64.exe -O +REM if errorlevel 1 exit /b 1 + +REM start /wait 7z1805-x64.exe /S +REM if errorlevel 1 exit /b 1 + +REM set "PATH=%ProgramFiles%\7-Zip;%PATH%" + +choco feature disable --name showDownloadProgress +choco feature enable --name allowGlobalConfirmation + +choco install curl 7zip diff --git a/packaging/windows/internal/env_fix.bat b/packaging/windows/internal/env_fix.bat new file mode 100644 index 0000000..dd0aaf5 --- /dev/null +++ b/packaging/windows/internal/env_fix.bat @@ -0,0 +1,31 @@ +@echo off + +:: Caution: Please don't use this script locally +:: It may destroy your build environment. 
+ +setlocal + +IF NOT EXIST "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" ( + echo Visual Studio 2017 C++ BuildTools is required to compile PyTorch on Windows + exit /b 1 +) + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15INSTALLDIR=%%i" + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto vswhere + ) +) + +:vswhere + +IF "%VS15VCVARSALL%"=="" ( + echo Visual Studio 2017 C++ BuildTools is required to compile PyTorch on Windows + exit /b 1 +) + +call "%VS15VCVARSALL%" x86_amd64 +for /f "usebackq tokens=*" %%i in (`where link.exe`) do move "%%i" "%%i.bak" + +endlocal diff --git a/packaging/windows/internal/nightly_defaults.bat b/packaging/windows/internal/nightly_defaults.bat new file mode 100644 index 0000000..2b5ca5c --- /dev/null +++ b/packaging/windows/internal/nightly_defaults.bat @@ -0,0 +1,200 @@ +@echo on + +if "%~1"=="" goto arg_error +if NOT "%~2"=="" goto arg_error +goto arg_end + +:arg_error + +echo Illegal number of parameters. Pass package type `Conda` or `Wheels`. +exit /b 1 + +:arg_end + +echo "nightly_defaults.bat at %CD% starting at %DATE%" + +set SRC_DIR=%~dp0\.. + +:: NIGHTLIES_FOLDER +:: N.B. this is also defined in cron_start.sh +:: An arbitrary root folder to store all nightlies folders, each of which is a +:: parent level date folder with separate subdirs for logs, wheels, conda +:: packages, etc. This should be kept the same across all scripts called in a +:: cron job, so it only has a default value in the top-most script +:: build_cron.sh to avoid the default values from diverging. +if "%NIGHTLIES_FOLDER%" == "" set "NIGHTLIES_FOLDER=%SRC_DIR%" + +:: NIGHTLIES_DATE +:: N.B. this is also defined in cron_start.sh +:: The date in YYYY_mm_dd format that we are building for. 
If this is not +:: already set, then this will first try to find the date of the nightlies +:: folder that this builder repo exists in; e.g. if this script exists in +:: some_dir/2019_09_04/builder/cron/ then this will be set to 2019_09_04 (must +:: match YYYY_mm_dd). This is for convenience when debugging/uploading past +:: dates, so that you don't have to set NIGHTLIES_DATE yourself. If a date +:: folder cannot be found in that exact location, then this will default to +:: the current date. + + +if "%NIGHTLIES_DATE%" == "" ( goto date_start ) else ( goto date_end ) + +:date_start + +set "DATE_CMD=Get-Date ([System.TimeZoneInfo]::ConvertTimeFromUtc((Get-Date).ToUniversalTime(), [System.TimeZoneInfo]::FindSystemTimeZoneById('Pacific Standard Time'))) -f 'yyyy_MM_dd'" +set "DATE_COMPACT_CMD=Get-Date ([System.TimeZoneInfo]::ConvertTimeFromUtc((Get-Date).ToUniversalTime(), [System.TimeZoneInfo]::FindSystemTimeZoneById('Pacific Standard Time'))) -f 'yyyyMMdd'" + +FOR /F "delims=" %%i IN ('powershell -c "%DATE_CMD%"') DO set NIGHTLIES_DATE=%%i +FOR /F "delims=" %%i IN ('powershell -c "%DATE_COMPACT_CMD%"') DO set NIGHTLIES_DATE_COMPACT=%%i + +:date_end + +if "%NIGHTLIES_DATE_COMPACT%" == "" set NIGHTLIES_DATE_COMPACT=%NIGHTLIES_DATE:~0,4%%NIGHTLIES_DATE:~5,2%%NIGHTLIES_DATE:~8,2% + +:: Used in lots of places as the root dir to store all conda/wheel/manywheel +:: packages as well as logs for the day +set today=%NIGHTLIES_FOLDER%\%NIGHTLIES_DATE% +mkdir "%today%" || ver >nul + + +::############################################################################# +:: Add new configuration variables below this line. 'today' should always be +:: defined ASAP to avoid weird errors +::############################################################################# + + +:: List of people to email when things go wrong. 
This is passed directly to +:: `mail -t` +:: TODO: Not supported yet +if "%NIGHTLIES_EMAIL_LIST%" == "" set NIGHTLIES_EMAIL_LIST=peterghost86@gmail.com + +:: PYTORCH_CREDENTIALS_FILE +:: A bash file that exports credentials needed to upload to aws and anaconda. +:: Needed variables are PYTORCH_ANACONDA_USERNAME, PYTORCH_ANACONDA_PASSWORD, +:: AWS_ACCESS_KEY_ID, and AWS_SECRET_ACCESS_KEY. Or it can just export the AWS +:: keys and then prepend a logged-in conda installation to the path. +:: TODO: Not supported yet +if "%PYTORCH_CREDENTIALS_FILE%" == "" set PYTORCH_CREDENTIALS_FILE=/c/Users/administrator/nightlies/credentials.sh + +:: Location of the temporary miniconda that is downloaded to install conda-build +:: and aws to upload finished packages TODO this is messy to install this in +:: upload.sh and later use it in upload_logs.sh +if "%CONDA_UPLOADER_INSTALLATION%" == "" set "CONDA_UPLOADER_INSTALLATION=%today%\miniconda" + +:: N.B. BUILDER_REPO and BUILDER_BRANCH are both set in cron_start.sh, as that +:: is the script that actually clones the builder repo that /this/ script is +:: running from. +pushd "%SRC_DIR%\.." +set NIGHTLIES_BUILDER_ROOT=%CD% +popd + +:: The shared pytorch repo to be used by all builds +if "%NIGHTLIES_PYTORCH_ROOT%" == "" set "NIGHTLIES_PYTORCH_ROOT=%today%\csprng" + +:: PYTORCH_REPO +:: The Github org/user whose fork of Pytorch to check out (git clone +:: https://github.com//pytorch.git). This will always be cloned +:: fresh to build with. Default is 'pytorch' +if "%PYTORCH_REPO%" == "" set PYTORCH_REPO=pytorch + +:: PYTORCH_BRANCH +:: The branch of Pytorch to checkout for building (git checkout ). +:: This can either be the name of the branch (e.g. git checkout +:: my_branch_name) or can be a git commit (git checkout 4b2674n...). 
Default +:: is 'latest', which is a special term that signals to pull the last commit +:: before 0:00 midnight on the NIGHTLIES_DATE +if "%PYTORCH_BRANCH%" == "" set PYTORCH_BRANCH=nightly + +:: Clone the requested pytorch checkout +if exist "%NIGHTLIES_PYTORCH_ROOT%" ( goto clone_end ) else ( goto clone_start ) + +:clone_start + +git clone --recursive "https://github.com/%PYTORCH_REPO%/csprng.git" "%NIGHTLIES_PYTORCH_ROOT%" +pushd "%NIGHTLIES_PYTORCH_ROOT%" + +if "%PYTORCH_BRANCH%" == "latest" ( goto latest_start ) else ( goto latest_end ) + +:latest_start + +:: Switch to the latest commit by 11:59 yesterday +echo PYTORCH_BRANCH is set to latest so I will find the last commit +echo before 0:00 midnight on %NIGHTLIES_DATE% +set git_date=%NIGHTLIES_DATE:_=-% +FOR /F "delims=" %%i IN ('git log --before %git_date% -n 1 "--pretty=%%H"') DO set last_commit=%%i +echo Setting PYTORCH_BRANCH to %last_commit% since that was the last +echo commit before %NIGHTLIES_DATE% +set PYTORCH_BRANCH=%last_commit% + +:latest_end + +git checkout "%PYTORCH_BRANCH%" +git submodule update +popd + +:clone_end + +if "%CUDA_VERSION%" == "cpu" ( + set _DESIRED_CUDA=cpu +) else ( + set _DESIRED_CUDA=cu%CUDA_VERSION% +) + +:: PYTORCH_BUILD_VERSION +:: The actual version string. Used in conda like +:: pytorch-nightly==1.0.0.dev20180908 +:: or in manylinux like +:: torch_nightly-1.0.0.dev20180908-cp27-cp27m-linux_x86_64.whl +if "%TORCHCSPRNG_BUILD_VERSION%" == "" set TORCHCSPRNG_BUILD_VERSION=0.9.0.dev%NIGHTLIES_DATE_COMPACT% + +if "%~1" == "Wheels" ( + if not "%CUDA_VERSION%" == "102" ( + set TORCHCSPRNG_BUILD_VERSION=%TORCHCSPRNG_BUILD_VERSION%+%_DESIRED_CUDA% + ) +) + +:: PYTORCH_BUILD_NUMBER +:: This is usually the number 1. If more than one build is uploaded for the +:: same version/date, then this can be incremented to 2,3 etc in which case +:: '.post2' will be appended to the version string of the package. 
This can +:: be set to '0' only if OVERRIDE_PACKAGE_VERSION is being used to bypass +:: all the version string logic in downstream scripts. Since we use the +:: override below, exporting this shouldn't actually matter. +if "%TORCHCSPRNG_BUILD_NUMBER%" == "" set /a TORCHCSPRNG_BUILD_NUMBER=1 +if %TORCHCSPRNG_BUILD_NUMBER% GTR 1 set TORCHCSPRNG_BUILD_VERSION=%TORCHCSPRNG_BUILD_VERSION%%TORCHCSPRNG_BUILD_NUMBER% + +:: The nightly builds use their own versioning logic, so we override whatever +:: logic is in setup.py or other scripts +:: TODO: Not supported yet +set OVERRIDE_PACKAGE_VERSION=%TORCHCSPRNG_BUILD_VERSION% +set BUILD_VERSION=%TORCHCSPRNG_BUILD_VERSION% + +:: Build folder for conda builds to use +if "%TORCH_CONDA_BUILD_FOLDER%" == "" set TORCH_CONDA_BUILD_FOLDER=torchcsprng + +:: TORCH_PACKAGE_NAME +:: The name of the package to upload. This should probably be pytorch or +:: pytorch-nightly. N.B. that pip will change all '-' to '_' but conda will +:: not. This is dealt with in downstream scripts. +:: TODO: Not supported yet +if "%TORCH_PACKAGE_NAME%" == "" set TORCH_PACKAGE_NAME=torchcsprng + +:: PIP_UPLOAD_FOLDER should end in a slash. This is to handle it being empty +:: (when uploading to e.g. whl/cpu/) and also to handle nightlies (when +:: uploading to e.g. /whl/nightly/cpu) +:: TODO: Not supported yet +if "%PIP_UPLOAD_FOLDER%" == "" set "PIP_UPLOAD_FOLDER=nightly\" + +:: The location of the binary_sizes dir in s3 is hardcoded into +:: upload_binary_sizes.sh + +:: DAYS_TO_KEEP +:: How many days to keep around for clean.sh. Build folders older than this +:: will be purged at the end of cron jobs. '1' means to keep only the current +:: day. Values less than 1 are not allowed. The default is 5. +:: TODO: Not supported yet +if "%DAYS_TO_KEEP%" == "" set /a DAYS_TO_KEEP=5 +if %DAYS_TO_KEEP% LSS 1 ( + echo DAYS_TO_KEEP cannot be less than 1. 
+ echo A value of 1 means to only keep the build for today + exit /b 1 +) diff --git a/packaging/windows/internal/publish.bat b/packaging/windows/internal/publish.bat new file mode 100644 index 0000000..7e820d7 --- /dev/null +++ b/packaging/windows/internal/publish.bat @@ -0,0 +1,89 @@ +@echo off + +set SRC_DIR=%~dp0 +pushd %SRC_DIR% + +if NOT "%CUDA_VERSION%" == "cpu" ( + set PACKAGE_SUFFIX=_cuda%CUDA_VERSION% +) else ( + set PACKAGE_SUFFIX= +) + +if "%PACKAGEFULLNAME%" == "Conda" ( + set PACKAGE=conda +) else ( + set PACKAGE=wheels +) + +if not defined PACKAGE_SUFFIX ( + set PUBLISH_BRANCH=csprng_%PACKAGE%_%DESIRED_PYTHON% +) else ( + set PUBLISH_BRANCH=csprng_%PACKAGE%_%DESIRED_PYTHON%%PACKAGE_SUFFIX% +) + +git clone %ARTIFACT_REPO_URL% -b %PUBLISH_BRANCH% --single-branch >nul 2>&1 + +IF ERRORLEVEL 1 ( + echo Branch %PUBLISH_BRANCH% not exist, falling back to master + set NO_BRANCH=1 + git clone %ARTIFACT_REPO_URL% -b master --single-branch >nul 2>&1 +) + +IF ERRORLEVEL 1 ( + echo Clone failed + goto err +) + +cd pytorch_builder +attrib -s -h -r . /s /d + +:: Empty repo +rd /s /q . || ver >nul + +IF NOT EXIST %PACKAGE% mkdir %PACKAGE% + +xcopy /S /E /Y ..\..\output\*.* %PACKAGE%\ + +git config --global user.name "Azure DevOps" +git config --global user.email peterghost86@gmail.com +git init +git checkout --orphan %PUBLISH_BRANCH% +git remote add origin %ARTIFACT_REPO_URL% +git add . 
+git commit -m "Update artifacts" + +:push + +if "%RETRY_TIMES%" == "" ( + set /a RETRY_TIMES=10 + set /a SLEEP_TIME=2 +) else ( + set /a RETRY_TIMES=%RETRY_TIMES%-1 + set /a SLEEP_TIME=%SLEEP_TIME%*2 +) + +git push origin %PUBLISH_BRANCH% -f > nul 2>&1 + +IF ERRORLEVEL 1 ( + echo Git push retry times remaining: %RETRY_TIMES% + echo Sleep time: %SLEEP_TIME% seconds + IF %RETRY_TIMES% EQU 0 ( + echo Push failed + goto err + ) + waitfor SomethingThatIsNeverHappening /t %SLEEP_TIME% 2>nul || ver >nul + goto push +) ELSE ( + set RETRY_TIMES= + set SLEEP_TIME= +) + +popd + +exit /b 0 + +:err + +popd + +exit /b 1 diff --git a/packaging/windows/internal/setup.bat b/packaging/windows/internal/setup.bat new file mode 100644 index 0000000..96cb7fb --- /dev/null +++ b/packaging/windows/internal/setup.bat @@ -0,0 +1,44 @@ +@echo off + +echo The flags after configuring: +echo NO_CUDA=%NO_CUDA% +echo CMAKE_GENERATOR=%CMAKE_GENERATOR% +if "%NO_CUDA%"=="" echo CUDA_PATH=%CUDA_PATH% +if NOT "%CC%"=="" echo CC=%CC% +if NOT "%CXX%"=="" echo CXX=%CXX% +if NOT "%DISTUTILS_USE_SDK%"=="" echo DISTUTILS_USE_SDK=%DISTUTILS_USE_SDK% + +set SRC_DIR=%~dp0\.. + +IF "%VSDEVCMD_ARGS%" == "" ( + call "%VS15VCVARSALL%" x64 +) ELSE ( + call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% +) + +pushd %SRC_DIR% + +IF NOT exist "setup.py" ( + cd %MODULE_NAME% +) + +if "%CXX%"=="sccache cl" ( + sccache --stop-server + sccache --start-server + sccache --zero-stats +) + +:pytorch +:: This stores in e.g. D:/_work/1/s/windows/output/cpu +pip wheel -e . --no-deps --wheel-dir ../output + +:build_end +IF ERRORLEVEL 1 exit /b 1 +IF NOT ERRORLEVEL 0 exit /b 1 + +if "%CXX%"=="sccache cl" ( + taskkill /im sccache.exe /f /t || ver > nul + taskkill /im nvcc.exe /f /t || ver > nul +) + +cd .. 
diff --git a/packaging/windows/internal/test.bat b/packaging/windows/internal/test.bat new file mode 100644 index 0000000..8e6878b --- /dev/null +++ b/packaging/windows/internal/test.bat @@ -0,0 +1,79 @@ +@echo off + +set SRC_DIR=%~dp0\.. +pushd %SRC_DIR% + +set PYTHON_VERSION=%PYTHON_PREFIX:py=cp% + +if "%BUILD_CSPRNG%" == "" ( + pip install future pytest coverage hypothesis protobuf +) ELSE ( + pip install future pytest "pillow>=4.1.1" +) + +for /F "delims=" %%i in ('where /R %SRC_DIR%\output *%MODULE_NAME%*%PYTHON_VERSION%*.whl') do pip install "%%i" + +if ERRORLEVEL 1 exit /b 1 + +if NOT "%BUILD_CSPRNG%" == "" ( + echo Smoke testing imports + python -c "import torchcsprng" + if ERRORLEVEL 1 exit /b 1 + goto smoke_test_end +) + +echo Smoke testing imports +python -c "import torch" +if ERRORLEVEL 1 exit /b 1 + +python -c "from caffe2.python import core" +if ERRORLEVEL 1 exit /b 1 + +echo Checking that MKL is available +python -c "import torch; exit(0 if torch.backends.mkl.is_available() else 1)" +if ERRORLEVEL 1 exit /b 1 + +setlocal EnableDelayedExpansion +set NVIDIA_GPU_EXISTS=0 +for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( + set GPUS=%%i + if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( + SET NVIDIA_GPU_EXISTS=1 + goto gpu_check_end + ) +) +:gpu_check_end +endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% + +if NOT "%CUDA_PREFIX%" == "cpu" if "%NVIDIA_GPU_EXISTS%" == "1" ( + echo Checking that CUDA archs are setup correctly + python -c "import torch; torch.randn([3,5]).cuda()" + if ERRORLEVEL 1 exit /b 1 + + echo Checking that magma is available + python -c "import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)" + if ERRORLEVEL 1 exit /b 1 + + echo Checking that CuDNN is available + python -c "import torch; exit(0 if torch.backends.cudnn.is_available() else 1)" + if ERRORLEVEL 1 exit /b 1 +) +:smoke_test_end + +echo Not running unit tests. 
Hopefully these problems are caught by CI +goto test_end + +if "%BUILD_CSPRNG%" == "" ( + cd pytorch\test + python run_test.py -v +) else ( + cd csprng + pytest . +) + +if ERRORLEVEL 1 exit /b 1 + +:test_end + +popd +exit /b 0 diff --git a/packaging/windows/internal/upload.bat b/packaging/windows/internal/upload.bat new file mode 100644 index 0000000..f78fe0b --- /dev/null +++ b/packaging/windows/internal/upload.bat @@ -0,0 +1,96 @@ +@echo off + +IF "%CONDA_UPLOADER_INSTALLATION%" == "" goto precheck_fail +IF "%PYTORCH_FINAL_PACKAGE_DIR%" == "" goto precheck_fail +IF "%today%" == "" goto precheck_fail +IF "%PYTORCH_ANACONDA_USERNAME%" == "" goto precheck_fail +IF "%PYTORCH_ANACONDA_PASSWORD%" == "" goto precheck_fail + +goto precheck_pass + +:precheck_fail + +echo Please run nightly_defaults.bat first. +echo And remember to set `PYTORCH_FINAL_PACKAGE_DIR` +echo Finally, don't forget to set anaconda tokens +exit /b 1 + +:precheck_pass + +pushd %today% + +:: Install anaconda client +set "CONDA_HOME=%CONDA_UPLOADER_INSTALLATION%" +set "tmp_conda=%CONDA_HOME%" +set "miniconda_exe=%CD%\miniconda.exe" +rmdir /s /q "%CONDA_HOME%" +del miniconda.exe +curl -k https://repo.continuum.io/miniconda/Miniconda3-latest-Windows-x86_64.exe -o "%miniconda_exe%" +popd + +IF ERRORLEVEL 1 ( + echo Conda download failed + exit /b 1 +) + +call %~dp0\..\..\conda\install_conda.bat + +IF ERRORLEVEL 1 ( + echo Conda installation failed + exit /b 1 +) + +set "ORIG_PATH=%PATH%" +set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%" + +REM conda install -y anaconda-client +pip install git+https://github.com/peterjc123/anaconda-client.git@log_more_meaningfull_errors +IF ERRORLEVEL 1 ( + echo Anaconda client installation failed + exit /b 1 +) + +set PYTORCH_FINAL_PACKAGE= +:: Upload all the packages under `PYTORCH_FINAL_PACKAGE_DIR` +FOR /F "delims=" %%i IN ('where /R %PYTORCH_FINAL_PACKAGE_DIR% *csprng*.tar.bz2') DO ( + set "PYTORCH_FINAL_PACKAGE=%%i" +) + +IF 
"%PYTORCH_FINAL_PACKAGE%" == "" ( + echo No package to upload + exit /b 0 +) + +:upload + +if "%RETRY_TIMES%" == "" ( + set /a RETRY_TIMES=10 + set /a SLEEP_TIME=2 +) else ( + set /a RETRY_TIMES=%RETRY_TIMES%-1 + set /a SLEEP_TIME=%SLEEP_TIME%*2 +) + +REM bash -c "yes | anaconda login --username "%PYTORCH_ANACONDA_USERNAME%" --password "%PYTORCH_ANACONDA_PASSWORD%"" +anaconda login --username "%PYTORCH_ANACONDA_USERNAME%" --password "%PYTORCH_ANACONDA_PASSWORD%" +IF ERRORLEVEL 1 ( + echo Anaconda client login failed + exit /b 1 +) + +echo Uploading %PYTORCH_FINAL_PACKAGE% to Anaconda Cloud +anaconda upload "%PYTORCH_FINAL_PACKAGE%" -u pytorch-nightly --label main --force --no-progress + +IF ERRORLEVEL 1 ( + echo Anaconda upload retry times remaining: %RETRY_TIMES% + echo Sleep time: %SLEEP_TIME% seconds + IF %RETRY_TIMES% EQU 0 ( + echo Upload failed + exit /b 1 + ) + waitfor SomethingThatIsNeverHappening /t %SLEEP_TIME% 2>nul || ver >nul + goto upload +) ELSE ( + set RETRY_TIMES= + set SLEEP_TIME= +) diff --git a/packaging/windows/internal/vc_env_helper.bat b/packaging/windows/internal/vc_env_helper.bat new file mode 100644 index 0000000..e85a372 --- /dev/null +++ b/packaging/windows/internal/vc_env_helper.bat @@ -0,0 +1,43 @@ +@echo on + +set VC_VERSION_LOWER=16 +set VC_VERSION_UPPER=17 +if "%VC_YEAR%" == "2017" ( + set VC_VERSION_LOWER=15 + set VC_VERSION_UPPER=16 +) + +for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( + if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( + set "VS15INSTALLDIR=%%i" + set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" + goto vswhere + ) +) + +:vswhere +if "%VSDEVCMD_ARGS%" == "" ( + call "%VS15VCVARSALL%" x64 || exit /b 1 +) else ( + call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 +) + +@echo on + +set DISTUTILS_USE_SDK=1 + +set args=%1 +shift 
+:start +if [%1] == [] goto done +set args=%args% %1 +shift +goto start + +:done +if "%args%" == "" ( + echo Usage: vc_env_helper.bat [command] [args] + echo e.g. vc_env_helper.bat cl /c test.cpp +) + +%args% || exit /b 1 diff --git a/packaging/windows/internal/vc_install_helper.sh b/packaging/windows/internal/vc_install_helper.sh new file mode 100644 index 0000000..cdae180 --- /dev/null +++ b/packaging/windows/internal/vc_install_helper.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +if [[ "$CU_VERSION" == "cu92" ]]; then + export VC_YEAR=2017 + export VSDEVCMD_ARGS="-vcvars_ver=14.13" + powershell packaging/windows/internal/vs2017_install.ps1 +elif [[ "$CU_VERSION" == "cu100" ]]; then + export VC_YEAR=2017 + export VSDEVCMD_ARGS="" + powershell packaging/windows/internal/vs2017_install.ps1 +else + export VC_YEAR=2019 + export VSDEVCMD_ARGS="" +fi diff --git a/packaging/windows/internal/vs2017_install.ps1 b/packaging/windows/internal/vs2017_install.ps1 new file mode 100644 index 0000000..3e953de --- /dev/null +++ b/packaging/windows/internal/vs2017_install.ps1 @@ -0,0 +1,25 @@ +$VS_DOWNLOAD_LINK = "https://aka.ms/vs/15/release/vs_buildtools.exe" +$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", + "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", + "--add Microsoft.Component.MSBuild", + "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", + "--add Microsoft.VisualStudio.Component.TextTemplating", + "--add Microsoft.VisualStudio.Component.VC.CoreIde", + "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", + "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", + "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") + +curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe +if ($LASTEXITCODE -ne 0) { + echo "Download of the VS 2017 installer failed" + exit 1 +} + +$process = Start-Process 
"${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru +Remove-Item -Path vs_installer.exe -Force +$exitCode = $process.ExitCode +if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." + exit 1 +} diff --git a/packaging/windows/internal/vs2019_install.ps1 b/packaging/windows/internal/vs2019_install.ps1 new file mode 100644 index 0000000..e436051 --- /dev/null +++ b/packaging/windows/internal/vs2019_install.ps1 @@ -0,0 +1,21 @@ +$VS_DOWNLOAD_LINK = "https://aka.ms/vs/16/release/vs_buildtools.exe" +$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", + "--add Microsoft.Component.MSBuild", + "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", + "--add Microsoft.VisualStudio.Component.VC.CoreBuildTools", + "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", + "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64") + +curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe +if ($LASTEXITCODE -ne 0) { + echo "Download of the VS 2019 installer failed" + exit 1 +} + +$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru +Remove-Item -Path vs_installer.exe -Force +$exitCode = $process.ExitCode +if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { + echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]." 
+ exit 1 +} diff --git a/packaging/windows/internal/vs_install.bat b/packaging/windows/internal/vs_install.bat new file mode 100644 index 0000000..348a5e3 --- /dev/null +++ b/packaging/windows/internal/vs_install.bat @@ -0,0 +1,14 @@ +@echo off + +set VS_DOWNLOAD_LINK=https://aka.ms/vs/15/release/vs_enterprise.exe +set VS_INSTALL_PATH=C:\Program Files (x86)\Microsoft Visual Studio\2017\Enterprise +set VS_INSTALL_ARGS=--nocache --quiet --wait --add Microsoft.VisualStudio.Component.VC.Tools.14.11 +set VSDEVCMD_ARGS=-vcvars_ver=14.11 + +curl -k -L %VS_DOWNLOAD_LINK% --output vs_installer.exe +if errorlevel 1 exit /b 1 + +start /wait vs_installer.exe modify --installPath "%VS_INSTALL_PATH%" %VS_INSTALL_ARGS% +if not errorlevel 0 exit /b 1 +if errorlevel 1 if not errorlevel 3010 exit /b 1 +if errorlevel 3011 exit /b 1 diff --git a/packaging/windows/old/cuda100.bat b/packaging/windows/old/cuda100.bat new file mode 100644 index 0000000..f088bca --- /dev/null +++ b/packaging/windows/old/cuda100.bat @@ -0,0 +1,59 @@ +@echo off + +IF NOT "%BUILD_CSPRNG%" == "" ( + set MODULE_NAME=csprng +) ELSE ( + set MODULE_NAME=pytorch +) + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd .. 
+ IF ERRORLEVEL 1 goto eof +) ELSE ( + call internal\clean.bat +) + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto eof + +REM Check for optional components + +set NO_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + goto optcheck +) + +IF "%CUDA_PATH_V10_0%"=="" ( + echo CUDA 10.0 not found, failing + exit /b 1 +) ELSE ( + IF "%BUILD_CSPRNG%" == "" ( + set TORCH_CUDA_ARCH_LIST=3.5;5.0+PTX;6.0;6.1;7.0;7.5 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all + ) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_50,code=compute_50 + ) + + set "CUDA_PATH=%CUDA_PATH_V10_0%" + set "PATH=%CUDA_PATH_V10_0%\bin;%PATH%" +) + +:optcheck + +IF "%BUILD_CSPRNG%" == "" ( + call internal\check_opts.bat + IF ERRORLEVEL 1 goto eof + + call internal\copy.bat + IF ERRORLEVEL 1 goto eof +) + +call internal\setup.bat +IF ERRORLEVEL 1 goto eof + +:eof diff --git a/packaging/windows/old/cuda90.bat b/packaging/windows/old/cuda90.bat new file mode 100644 index 0000000..520b794 --- /dev/null +++ b/packaging/windows/old/cuda90.bat @@ -0,0 +1,59 @@ +@echo off + +IF NOT "%BUILD_CSPRNG%" == "" ( + set MODULE_NAME=csprng +) ELSE ( + set MODULE_NAME=pytorch +) + +IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" ( + call internal\clone.bat + cd .. 
+ IF ERRORLEVEL 1 goto eof +) ELSE ( + call internal\clean.bat +) + +call internal\check_deps.bat +IF ERRORLEVEL 1 goto eof + +REM Check for optional components + +set NO_CUDA= +set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 + +IF "%NVTOOLSEXT_PATH%"=="" ( + echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing + exit /b 1 + goto optcheck +) + +IF "%CUDA_PATH_V9_0%"=="" ( + echo CUDA 9 not found, failing + exit /b 1 +) ELSE ( + IF "%BUILD_CSPRNG%" == "" ( + set TORCH_CUDA_ARCH_LIST=3.5;5.0+PTX;6.0;7.0 + set TORCH_NVCC_FLAGS=-Xfatbin -compress-all + ) ELSE ( + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_50,code=compute_50 + ) + + set "CUDA_PATH=%CUDA_PATH_V9_0%" + set "PATH=%CUDA_PATH_V9_0%\bin;%PATH%" +) + +:optcheck + +IF "%BUILD_CSPRNG%" == "" ( + call internal\check_opts.bat + IF ERRORLEVEL 1 goto eof + + call internal\copy.bat + IF ERRORLEVEL 1 goto eof +) + +call internal\setup.bat +IF ERRORLEVEL 1 goto eof + +:eof diff --git a/packaging/windows/templates/auth_task.yml b/packaging/windows/templates/auth_task.yml new file mode 100644 index 0000000..7554ffa --- /dev/null +++ b/packaging/windows/templates/auth_task.yml @@ -0,0 +1,17 @@ +jobs: +- job: 'VSTS_Auth_Task' + timeoutInMinutes: 5 + cancelTimeoutInMinutes: 5 + variables: + - group: 'peterjc-vsts-token' + + pool: + vmImage: 'vs2017-win2016' + + steps: + - checkout: self + clean: true + + - template: vsts_auth.yml + parameters: + auth: $(vsts_auth) diff --git a/packaging/windows/templates/build_conda.yml b/packaging/windows/templates/build_conda.yml new file mode 100644 index 0000000..ce29c06 --- /dev/null +++ b/packaging/windows/templates/build_conda.yml @@ -0,0 +1,15 @@ +parameters: + msagent: false + +steps: +- bash: 'find . 
-name "*.sh" -exec dos2unix {} +' + displayName: Replace file endings + +- script: 'if not exist %PYTORCH_FINAL_PACKAGE_DIR% mkdir %PYTORCH_FINAL_PACKAGE_DIR%' + displayName: 'Create final package directory' + +- bash: './packaging/conda/build_csprng.sh $CUDA_VERSION $TORCHCSPRNG_BUILD_VERSION $TORCHCSPRNG_BUILD_NUMBER' + displayName: Build + env: + ${{ if eq(parameters.msagent, 'true') }}: + MAX_JOBS: 2 diff --git a/packaging/windows/templates/build_task.yml b/packaging/windows/templates/build_task.yml new file mode 100644 index 0000000..18d4f8e --- /dev/null +++ b/packaging/windows/templates/build_task.yml @@ -0,0 +1,173 @@ +parameters: + package: '' + spec: '' + jobDesc: '' + packageDesc: '' + msagent: true + cpuEnabled: true + cudaEnabled: true + condaEnabled: true + wheelsEnabled: true + override: false + +jobs: +- job: 'Windows_${{ parameters.spec }}_${{ parameters.package }}_Build' + timeoutInMinutes: 60 + cancelTimeoutInMinutes: 5 + condition: > + or(and(eq('${{ parameters.package }}', 'Conda'), eq('${{ parameters.spec }}', 'CPU'), + eq('${{ parameters.condaEnabled }}', 'true'), eq('${{ parameters.cpuEnabled }}', 'true')), + and(eq('${{ parameters.package }}', 'Wheels'), eq('${{ parameters.spec }}', 'CPU'), + eq('${{ parameters.wheelsEnabled }}', 'true'), eq('${{ parameters.cpuEnabled }}', 'true')), + and(eq('${{ parameters.package }}', 'Conda'), eq('${{ parameters.spec }}', 'CUDA'), + eq('${{ parameters.condaEnabled }}', 'true'), eq('${{ parameters.cudaEnabled }}', 'true')), + and(eq('${{ parameters.package }}', 'Wheels'), eq('${{ parameters.spec }}', 'CUDA'), + eq('${{ parameters.wheelsEnabled }}', 'true'), eq('${{ parameters.cudaEnabled }}', 'true'))) + variables: + - ${{ if eq(parameters.override, 'true') }}: + - name: TORCHCSPRNG_BUILD_NUMBER + value: 1 + - name: PYTORCH_REPO + value: 'pytorch' + - name: PYTORCH_BRANCH + value: 'v0.4.0' + - ${{ if eq(parameters.msagent, 'true') }}: + - name: USE_SCCACHE + value: 0 + - ${{ if eq(parameters.msagent, 
'false') }}: + - name: USE_SCCACHE + value: 1 + - ${{ if eq(parameters.package, 'Conda') }}: + - group: peterjc_anaconda_token + - name: PYTORCH_FINAL_PACKAGE_DIR + value: '$(Build.Repository.LocalPath)\packaging\windows\output' + + strategy: + maxParallel: 10 + matrix: + ${{ if eq(parameters.spec, 'CPU') }}: + PY3.5: + DESIRED_PYTHON: 3.5 + CUDA_VERSION: cpu + PY3.6: + DESIRED_PYTHON: 3.6 + CUDA_VERSION: cpu + PY3.7: + DESIRED_PYTHON: 3.7 + CUDA_VERSION: cpu + PY3.8: + DESIRED_PYTHON: 3.8 + CUDA_VERSION: cpu + PY3.9: + DESIRED_PYTHON: 3.9 + CUDA_VERSION: cpu + ${{ if ne(parameters.spec, 'CPU') }}: + PY3.5_92: + DESIRED_PYTHON: 3.5 + CUDA_VERSION: 92 + PY3.6_92: + DESIRED_PYTHON: 3.6 + CUDA_VERSION: 92 + PY3.7_92: + DESIRED_PYTHON: 3.7 + CUDA_VERSION: 92 + PY3.8_92: + DESIRED_PYTHON: 3.8 + CUDA_VERSION: 92 + PY3.9_92: + DESIRED_PYTHON: 3.9 + CUDA_VERSION: 92 + PY3.5_101: + DESIRED_PYTHON: 3.5 + CUDA_VERSION: 101 + PY3.6_101: + DESIRED_PYTHON: 3.6 + CUDA_VERSION: 101 + PY3.7_101: + DESIRED_PYTHON: 3.7 + CUDA_VERSION: 101 + PY3.8_101: + DESIRED_PYTHON: 3.8 + CUDA_VERSION: 101 + PY3.9_101: + DESIRED_PYTHON: 3.9 + CUDA_VERSION: 101 + PY3.5_102: + DESIRED_PYTHON: 3.5 + CUDA_VERSION: 102 + PY3.6_102: + DESIRED_PYTHON: 3.6 + CUDA_VERSION: 102 + PY3.7_102: + DESIRED_PYTHON: 3.7 + CUDA_VERSION: 102 + PY3.8_102: + DESIRED_PYTHON: 3.8 + CUDA_VERSION: 102 + PY3.9_102: + DESIRED_PYTHON: 3.9 + CUDA_VERSION: 102 + + pool: + ${{ if eq(parameters.msagent, 'true') }}: + vmImage: 'vs2017-win2016' + ${{ if eq(parameters.msagent, 'false') }}: + name: 'release' + + steps: + - checkout: self + clean: true + + - template: setup_env_for_msagent.yml + parameters: + msagent: ${{ parameters.msagent }} + + # - ${{ if and(eq(parameters.override, 'true'), eq(parameters.package, 'Wheels')) }}: + # - template: override_pytorch_version.yml + + - template: setup_nightly_variables.yml + parameters: + package: ${{ parameters.package }} + + - ${{ if eq(parameters.package, 'Wheels') }}: + - template: 
build_wheels.yml + parameters: + msagent: ${{ parameters.msagent }} + + - ${{ if eq(parameters.package, 'Conda') }}: + - template: build_conda.yml + parameters: + msagent: ${{ parameters.msagent }} + + - ${{ if or(eq(parameters.package, 'Wheels'), eq(parameters.package, 'Conda')) }}: + - template: publish_test_results.yml + parameters: + msagent: ${{ parameters.msagent }} + + # If you want to upload binaries to S3 & Anaconda Cloud, please uncomment this section. + - ${{ if and(eq(parameters.package, 'Wheels'), eq(parameters.spec, 'CPU')) }}: + - template: upload_to_s3.yml + parameters: + cuVer: '$(CUDA_VERSION)' + cudaVer: '$(CUDA_VERSION)' + + - ${{ if and(eq(parameters.package, 'Wheels'), ne(parameters.spec, 'CPU')) }}: + - template: upload_to_s3.yml + parameters: + cuVer: 'cu$(CUDA_VERSION)' + cudaVer: 'cuda$(CUDA_VERSION)' + + - ${{ if eq(parameters.package, 'Conda') }}: + - template: upload_to_conda.yml + parameters: + user: $(peterjc_conda_username) + pass: $(peterjc_conda_password) + + # If you want to upload binaries to Azure Git, please uncomment this section. 
+ # - ${{ if or(eq(parameters.package, 'Wheels'), eq(parameters.package, 'Conda')) }}: + # - template: publish_test_results.yml + # parameters: + # msagent: ${{ parameters.msagent }} + # - template: publish_packages.yml + # parameters: + # package: ${{ parameters.package }} diff --git a/packaging/windows/templates/build_wheels.yml b/packaging/windows/templates/build_wheels.yml new file mode 100644 index 0000000..8393fdb --- /dev/null +++ b/packaging/windows/templates/build_wheels.yml @@ -0,0 +1,9 @@ +parameters: + msagent: false + +steps: +- script: 'call packaging/windows/build_csprng.bat %CUDA_VERSION% %TORCHCSPRNG_BUILD_VERSION% %TORCHCSPRNG_BUILD_NUMBER%' + displayName: Build + env: + ${{ if eq(parameters.msagent, 'true') }}: + MAX_JOBS: 2 diff --git a/packaging/windows/templates/linux_build_task.yml b/packaging/windows/templates/linux_build_task.yml new file mode 100644 index 0000000..0b32892 --- /dev/null +++ b/packaging/windows/templates/linux_build_task.yml @@ -0,0 +1,38 @@ +parameters: + msagent: true + enabled: false + +jobs: +- job: 'Linux_CPU_Conda_Build' + timeoutInMinutes: 0 + cancelTimeoutInMinutes: 5 + condition: ${{ eq(parameters.enabled, 'true') }} + variables: + CUDA_VERSION: cpu + TORCH_CONDA_BUILD_FOLDER: pytorch-nightly + PYTORCH_FINAL_PACKAGE_DIR: '$(Build.Repository.LocalPath)/output' + + strategy: + maxParallel: 10 + matrix: + PY3.5: + DESIRED_PYTHON: 3.5 + + pool: + vmImage: 'ubuntu-16.04' + + steps: + - checkout: self + clean: true + + - script: 'sudo apt-get install p7zip-full' + displayName: 'Install 7Zip' + + - task: CondaEnvironment@1 + displayName: 'Install conda-build' + inputs: + packageSpecs: 'conda-build' + + - template: build_conda.yml + parameters: + msagent: ${{ parameters.msagent }} diff --git a/packaging/windows/templates/override_pytorch_version.yml b/packaging/windows/templates/override_pytorch_version.yml new file mode 100644 index 0000000..8af93ae --- /dev/null +++ 
b/packaging/windows/templates/override_pytorch_version.yml @@ -0,0 +1,6 @@ +steps: +- script: 'windows/internal/override_pytorch_version.bat' + displayName: 'Override PyTorch Build Version for Wheels' + +- script: 'echo $(PYTORCH_BUILD_VERSION)' + displayName: 'Show PyTorch Build Version' diff --git a/packaging/windows/templates/publish_packages.yml b/packaging/windows/templates/publish_packages.yml new file mode 100644 index 0000000..51ce824 --- /dev/null +++ b/packaging/windows/templates/publish_packages.yml @@ -0,0 +1,8 @@ +parameters: + package: '' + +steps: +- script: 'packaging/windows/internal/publish.bat' + displayName: 'Upload packages to Azure DevOps Repo' + env: + PACKAGEFULLNAME: ${{ parameters.package }} diff --git a/packaging/windows/templates/publish_test_results.yml b/packaging/windows/templates/publish_test_results.yml new file mode 100644 index 0000000..1e0dc02 --- /dev/null +++ b/packaging/windows/templates/publish_test_results.yml @@ -0,0 +1,6 @@ +steps: +- task: PublishTestResults@2 # No test results to publish + inputs: + testResultsFiles: 'windows/pytorch/test/**/*.xml' + testRunTitle: 'Publish test results' + enabled: false diff --git a/packaging/windows/templates/setup_env_for_msagent.yml b/packaging/windows/templates/setup_env_for_msagent.yml new file mode 100644 index 0000000..377734f --- /dev/null +++ b/packaging/windows/templates/setup_env_for_msagent.yml @@ -0,0 +1,25 @@ +parameters: + msagent: false + +steps: +- ${{ if eq(parameters.msagent, 'true') }}: + - task: BatchScript@1 + displayName: 'Install 7Zip & cURL' + inputs: + filename: 'packaging/windows/internal/dep_install.bat' + + modifyEnvironment: true + + - task: BatchScript@1 + displayName: 'Install Visual Studio 2017' + inputs: + filename: 'packaging/windows/internal/vs_install.bat' + + modifyEnvironment: true + + - task: BatchScript@1 + displayName: 'Install CUDA' + inputs: + filename: 'packaging/windows/internal/cuda_install.bat' + + modifyEnvironment: true diff --git 
a/packaging/windows/templates/setup_nightly_variables.yml b/packaging/windows/templates/setup_nightly_variables.yml new file mode 100644 index 0000000..94b2fe9 --- /dev/null +++ b/packaging/windows/templates/setup_nightly_variables.yml @@ -0,0 +1,11 @@ +parameters: + package: '' + +steps: +- task: BatchScript@1 + displayName: 'Setup nightly variables' + inputs: + filename: 'packaging/windows/internal/nightly_defaults.bat' + arguments: ${{ parameters.package }} + + modifyEnvironment: true diff --git a/packaging/windows/templates/upload_to_conda.yml b/packaging/windows/templates/upload_to_conda.yml new file mode 100644 index 0000000..dc172bc --- /dev/null +++ b/packaging/windows/templates/upload_to_conda.yml @@ -0,0 +1,10 @@ +parameters: + user: '' + pass: '' + +steps: +- script: 'call packaging/windows/internal/upload.bat' + displayName: 'Upload packages to Anaconda Cloud' + env: + PYTORCH_ANACONDA_USERNAME: ${{ parameters.user }} + PYTORCH_ANACONDA_PASSWORD: ${{ parameters.pass }} diff --git a/packaging/windows/templates/upload_to_s3.yml b/packaging/windows/templates/upload_to_s3.yml new file mode 100644 index 0000000..1de91b5 --- /dev/null +++ b/packaging/windows/templates/upload_to_s3.yml @@ -0,0 +1,15 @@ +parameters: + cuVer: '' + cudaVer: '' + +steps: +- task: AmazonWebServices.aws-vsts-tools.S3Upload.S3Upload@1 + displayName: 'Upload ${{ parameters.cuVer }} wheel to S3' + inputs: + awsCredentials: 'Pytorch S3 bucket' + bucketName: 'pytorch' + sourceFolder: 'packaging/windows/output' + globExpressions: '*.whl' + targetFolder: 'whl/nightly/${{ parameters.cuVer }}/' + filesAcl: 'public-read' + flattenFolders: 'true' diff --git a/packaging/windows/templates/vsts_auth.yml b/packaging/windows/templates/vsts_auth.yml new file mode 100644 index 0000000..fde767d --- /dev/null +++ b/packaging/windows/templates/vsts_auth.yml @@ -0,0 +1,8 @@ +parameters: + auth: '' + +steps: +- script: 'call packaging/windows/internal/auth.bat' + displayName: 'Sign in to Azure Pipelines' 
+ env: + VSTS_AUTH: ${{ parameters.auth }} diff --git a/setup.py b/setup.py index cc77d67..5143b53 100644 --- a/setup.py +++ b/setup.py @@ -1,94 +1,193 @@ +import distutils.command.clean +import glob import os -from sys import platform +import shutil import subprocess -from setuptools import setup -from torch.utils import cpp_extension - -cu_version = os.getenv('CU_VERSION', default=None) -if cu_version is None: - use_cuda = os.getenv('USE_CUDA', default=None) - if use_cuda is None: - build_cuda = cpp_extension.CUDA_HOME is not None - else: - build_cuda = use_cuda -else: - build_cuda = cu_version != 'cpu' - -CXX_FLAGS = [] -if platform != "darwin": - CXX_FLAGS.append('-fopenmp') - -NVCC_FLAGS = os.getenv('NVCC_FLAGS', '') -if NVCC_FLAGS == '': - NVCC_FLAGS = [] -else: - NVCC_FLAGS = NVCC_FLAGS.split(' ') -# TODO: replace with a loop: -if '--expt-extended-lambda' not in NVCC_FLAGS: - NVCC_FLAGS.append('--expt-extended-lambda') -if '-Xcompiler' not in NVCC_FLAGS: - NVCC_FLAGS.append('-Xcompiler') -if '-fopenmp' not in NVCC_FLAGS: - NVCC_FLAGS.append('-fopenmp') -# NVCC_FLAGS = ['--expt-extended-lambda', '-Xcompiler', '-fopenmp'] - -module_name = 'torch_csprng' - -this_dir = os.path.dirname(os.path.abspath(__file__)) -extensions_dir = os.path.join(this_dir, module_name, 'csrc') - -if build_cuda: - csprng_ext = cpp_extension.CUDAExtension( - module_name, [os.path.join(extensions_dir, 'csprng.cu')], - extra_compile_args={'cxx': [], - 'nvcc': NVCC_FLAGS} - ) -else: - csprng_ext = cpp_extension.CppExtension( - module_name, [os.path.join(extensions_dir, 'csprng.cpp')], - extra_compile_args={'cxx': CXX_FLAGS} - ) +import sys + +import torch +from setuptools import find_packages, setup +from torch.utils.cpp_extension import ( + BuildExtension, + CppExtension, + CUDA_HOME, + CUDAExtension, +) + +version = open("version.txt", "r").read().strip() +sha = "Unknown" +package_name = "torchcsprng" -version = open('version.txt', 'r').read().strip() -sha = 'Unknown' -package_name = 
'pytorch_csprng' +cwd = os.path.dirname(os.path.abspath(__file__)) try: - sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=this_dir).decode('ascii').strip() + sha = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd) + .decode("ascii") + .strip() + ) except Exception: pass -if os.getenv('BUILD_VERSION'): - version = os.getenv('BUILD_VERSION') -elif sha != 'Unknown': - version += '+' + sha[:7] +if os.getenv("BUILD_VERSION"): + version = os.getenv("BUILD_VERSION") +elif sha != "Unknown": + version += "+" + sha[:7] print("Building wheel {}-{}".format(package_name, version)) -# Doesn't work yet :( -# version_path = os.path.join(this_dir, module_name, 'version.py') -# with open(version_path, 'w') as f: -# f.write("__version__ = '{}'\n".format(version)) -# f.write("git_version = {}\n".format(repr(sha))) + +def write_version_file(): + version_path = os.path.join(cwd, "torchcsprng", "version.py") + with open(version_path, "w") as f: + f.write("__version__ = '{}'\n".format(version)) + f.write("git_version = {}\n".format(repr(sha))) + # f.write("from torchcsprng.extension import _check_cuda_version\n") + # f.write("if _check_cuda_version() > 0:\n") + # f.write(" cuda = _check_cuda_version()\n") + + +write_version_file() with open("README.md", "r") as fh: long_description = fh.read() + +requirements = [ + "torch", +] + + +def append_flags(flags, flags_to_append): + for flag in flags_to_append: + if not flag in flags: + flags.append(flag) + return flags + + +def get_extensions(): + build_cuda = torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1" + + module_name = "torchcsprng" + + extensions_dir = os.path.join(cwd, module_name, "csrc") + + openmp = "ATen parallel backend: OpenMP" in torch.__config__.parallel_info() + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + + sources = main_file + source_cpu + extension = CppExtension + + define_macros = [] 
+ + cxx_flags = os.getenv("CXX_FLAGS", "") + if cxx_flags == "": + cxx_flags = [] + else: + cxx_flags = cxx_flags.split(" ") + if openmp: + if sys.platform == "linux": + cxx_flags = append_flags(cxx_flags, ["-fopenmp"]) + elif sys.platform == "win32": + cxx_flags = append_flags(cxx_flags, ["/openmp"]) + # elif sys.platform == 'darwin': + # cxx_flags = append_flags(cxx_flags, ['-Xpreprocessor', '-fopenmp']) + + if build_cuda: + extension = CUDAExtension + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + sources += source_cuda + + define_macros += [("WITH_CUDA", None)] + + nvcc_flags = os.getenv("NVCC_FLAGS", "") + if nvcc_flags == "": + nvcc_flags = [] + else: + nvcc_flags = nvcc_flags.split(" ") + nvcc_flags = append_flags(nvcc_flags, ["--expt-extended-lambda", "-Xcompiler"]) + extra_compile_args = { + "cxx": cxx_flags, + "nvcc": nvcc_flags, + } + else: + extra_compile_args = { + "cxx": cxx_flags, + } + + ext_modules = [ + extension( + module_name + "._C", + sources, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +class clean(distutils.command.clean.clean): + def run(self): + with open(".gitignore", "r") as f: + ignores = f.read() + start_deleting = False + for wildcard in filter(None, ignores.split("\n")): + if ( + wildcard + == "# do not change or delete this comment - `python setup.py clean` deletes everything after this line" + ): + start_deleting = True + if not start_deleting: + continue + for filename in glob.glob(wildcard): + try: + os.remove(filename) + except OSError: + shutil.rmtree(filename, ignore_errors=True) + + # It's an old-style class in Python 2.7... 
+ distutils.command.clean.clean.run(self) + + setup( + # Metadata name=package_name, version=version, author="Pavel Belevich", author_email="pbelevich@fb.com", - description="Cryptographically secure pseudorandom number generators for PyTorch", - # long_description=long_description, - # long_description_content_type="text/markdown", - license='BSD-3', url="https://github.com/pytorch/csprng", + description="Cryptographically secure pseudorandom number generators for PyTorch", + long_description=long_description, + long_description_content_type="text/markdown", + license="BSD-3", + # Package info + packages=find_packages(exclude=("test",)), classifiers=[ + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Programming Language :: C++", "Programming Language :: Python :: 3", - 'License :: OSI Approved :: BSD License', - 'Programming Language :: C++', - 'Programming Language :: Python :: 3', + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Mathematics", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", ], - python_requires='>=3.6', - ext_modules=[csprng_ext], - cmdclass={'build_ext': cpp_extension.BuildExtension}) + python_requires=">=3.6", + install_requires=requirements, + ext_modules=get_extensions(), + test_suite="test", + cmdclass={ + "build_ext": BuildExtension, + "clean": clean, + }, +) diff --git a/test/__init__.py b/test/__init__.py index e69de29..83766c4 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. 
All Rights Reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/test/test_csprng.py b/test/test_csprng.py index 8447458..3245ffd 100644 --- a/test/test_csprng.py +++ b/test/test_csprng.py @@ -1,36 +1,71 @@ -import unittest -import torch -from scipy import stats -import numpy as np +# Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + import math +import os import random import time +import unittest + +import numpy as np +import torch +from Crypto.Cipher import AES +from Crypto.Util import Counter +from scipy import stats try: - import torch_csprng as csprng + import torchcsprng as csprng except ImportError: raise RuntimeError("CSPRNG not available") +IS_SANDCASTLE = ( + os.getenv("SANDCASTLE") == "1" or os.getenv("TW_JOB_USER") == "sandcastle" +) +IS_FBCODE = os.getenv("PYTORCH_TEST_FBCODE") == "1" + + +def to_numpy(t, dtype=torch.float): + if t.dtype == torch.bfloat16: + t = t.to(dtype) + return t.numpy() + + +def to_bytes(t): + if t.dtype == torch.bfloat16: + t = t.view(torch.int16) + return t.cpu().numpy().view(np.int8) + + class TestCSPRNG(unittest.TestCase): all_generators = [ csprng.create_random_device_generator(), - csprng.create_random_device_generator('/dev/urandom'), + csprng.create_random_device_generator("/dev/urandom"), csprng.create_mt19937_generator(), - csprng.create_mt19937_generator(42) + csprng.create_mt19937_generator(42), ] int_dtypes = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64] - fp_ftypes = [torch.float, torch.double] + standard_fp_dtypes = [torch.float, torch.double] + + non_standard_fp_dtypes = [torch.half, torch.bfloat16] + + fp_dtypes = standard_fp_dtypes + non_standard_fp_dtypes - num_dtypes = int_dtypes + fp_ftypes + num_dtypes = int_dtypes + fp_dtypes 
all_dtypes = num_dtypes + [torch.bool] size = 1000 - all_devices = ['cpu', 'cuda'] if csprng.supports_cuda() else ['cpu'] + all_devices = ( + ["cpu", "cuda"] + if (torch.cuda.is_available() and csprng.supports_cuda()) + else ["cpu"] + ) def test_random_kstest(self): for device in self.all_devices: @@ -40,20 +75,38 @@ def test_random_kstest(self): to_inc = 2**24 elif dtype == torch.double: to_inc = 2**53 + elif dtype == torch.half: + to_inc = 2**11 + elif dtype == torch.bfloat16: + to_inc = 2**8 else: to_inc = torch.iinfo(dtype).max - t = torch.empty(self.size, dtype=dtype, device=device).random_(generator=gen) - res = stats.kstest(t.cpu(), stats.randint.cdf, args=(0, to_inc)) + t = torch.empty(self.size, dtype=dtype, device=device).random_( + generator=gen + ) + res = stats.kstest( + to_numpy(t.cpu()), stats.randint.cdf, args=(0, to_inc) + ) self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + no_cuda = not torch.cuda.is_available() or not csprng.supports_cuda() + + no_cuda_message = ( + "CUDA is not available or csprng was not compiled with CUDA support" + ) + + @unittest.skipIf(no_cuda, no_cuda_message) def test_random_cpu_vs_cuda(self): for dtype in self.num_dtypes: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').random_(generator=gen) + cpu_t = torch.empty(self.size, dtype=dtype, device="cpu").random_( + generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').random_(generator=gen) + cuda_t = torch.empty(self.size, dtype=dtype, device="cuda").random_( + generator=gen + ) self.assertTrue((cpu_t == cuda_t.cpu()).all()) def test_random_to_kstest(self): @@ -61,18 +114,26 @@ def test_random_to_kstest(self): for device in self.all_devices: for gen in self.all_generators: for dtype in self.num_dtypes: - t = torch.zeros(self.size, dtype=dtype, device=device).random_(to_, 
generator=gen) - res = stats.kstest(t.cpu(), stats.randint.cdf, args=(0, to_)) + t = torch.zeros(self.size, dtype=dtype, device=device).random_( + to_, generator=gen + ) + res = stats.kstest( + to_numpy(t.cpu()), stats.randint.cdf, args=(0, to_) + ) self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_random_to_cpu_vs_cuda(self): to_ = 42 for dtype in self.num_dtypes: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.zeros(self.size, dtype=dtype, device='cpu').random_(to_, generator=gen) + cpu_t = torch.zeros(self.size, dtype=dtype, device="cpu").random_( + to_, generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.zeros(self.size, dtype=dtype, device='cuda').random_(to_, generator=gen) + cuda_t = torch.zeros(self.size, dtype=dtype, device="cuda").random_( + to_, generator=gen + ) self.assertTrue((cpu_t == cuda_t.cpu()).all()) def test_random_from_to_kstest(self): @@ -82,20 +143,30 @@ def test_random_from_to_kstest(self): for from_ in [0, 24, 42]: for to_ in [42, 99, 123]: if from_ < to_: - t = torch.zeros(self.size, dtype=dtype, device=device).random_(from_, to_, generator=gen) - res = stats.kstest(t.cpu(), stats.randint.cdf, args=(from_, to_)) + t = torch.zeros( + self.size, dtype=dtype, device=device + ).random_(from_, to_, generator=gen) + res = stats.kstest( + to_numpy(t.cpu()), + stats.randint.cdf, + args=(from_, to_), + ) self.assertTrue(res.statistic < 0.2) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_random_from_to_cpu_vs_cuda(self): for dtype in self.num_dtypes: for from_ in [0, 24, 42]: for to_ in [42, 99, 123]: if from_ < to_: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.zeros(self.size, dtype=dtype, device='cpu').random_(from_, to_, generator=gen) + cpu_t = torch.zeros( + self.size, 
dtype=dtype, device="cpu" + ).random_(from_, to_, generator=gen) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.zeros(self.size, dtype=dtype, device='cuda').random_(from_, to_, generator=gen) + cuda_t = torch.zeros( + self.size, dtype=dtype, device="cuda" + ).random_(from_, to_, generator=gen) self.assertTrue((cpu_t == cuda_t.cpu()).all()) def test_random_bool(self): @@ -107,147 +178,225 @@ def test_random_bool(self): t.random_(generator=gen) self.assertEqual(t.min(), False) self.assertEqual(t.max(), True) - self.assertTrue(0.4 < (t.eq(True)).to(torch.int).sum().item() / self.size < 0.6) + self.assertTrue( + 0.4 < (t.eq(True)).to(torch.int).sum().item() / self.size < 0.6 + ) t.fill_(True) t.random_(generator=gen) self.assertEqual(t.min(), False) self.assertEqual(t.max(), True) - self.assertTrue(0.4 < (t.eq(True)).to(torch.int).sum().item() / self.size < 0.6) + self.assertTrue( + 0.4 < (t.eq(True)).to(torch.int).sum().item() / self.size < 0.6 + ) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_random_bool_cpu_vs_cuda(self): gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=torch.bool, device='cpu').random_(generator=gen) + cpu_t = torch.empty(self.size, dtype=torch.bool, device="cpu").random_( + generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=torch.bool, device='cuda').random_(generator=gen) + cuda_t = torch.empty(self.size, dtype=torch.bool, device="cuda").random_( + generator=gen + ) self.assertTrue((cpu_t == cuda_t.cpu()).all()) def test_uniform_kstest(self): for device in self.all_devices: for gen in self.all_generators: - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for from_ in [-42, 0, 4.2]: for to_ in [-4.2, 0, 42]: if to_ > from_: - t = torch.empty(self.size, dtype=dtype, device=device).uniform_(from_, to_, generator=gen) - res = 
stats.kstest(t.cpu().to(torch.double), 'uniform', args=(from_, (to_ - from_))) + t = torch.empty( + self.size, dtype=dtype, device=device + ).uniform_(from_, to_, generator=gen) + res = stats.kstest( + to_numpy(t.cpu(), torch.double), + "uniform", + args=(from_, (to_ - from_)), + ) self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_uniform_cpu_vs_cuda(self): - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for from_ in [-42, 0, 4.2]: for to_ in [-4.2, 0, 42]: if to_ > from_: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').uniform_(from_, to_, generator=gen) + cpu_t = torch.empty( + self.size, dtype=dtype, device="cpu" + ).uniform_(from_, to_, generator=gen) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').uniform_(from_, to_, generator=gen) - self.assertTrue((cpu_t - cuda_t.cpu()).abs().max() < 1e-9) + cuda_t = torch.empty( + self.size, dtype=dtype, device="cuda" + ).uniform_(from_, to_, generator=gen) + self.assertTrue(torch.allclose(cpu_t, cuda_t.cpu(), 1e-9)) def test_normal_kstest(self): for device in self.all_devices: for gen in self.all_generators: - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for mean in [-3, 0, 7]: for std in [1, 5, 7]: - t = torch.empty(self.size, dtype=dtype, device=device).normal_(mean=mean, std=std, generator=gen) - res = stats.kstest(t.cpu().to(torch.double), 'norm', args=(mean, std)) + t = torch.empty( + self.size, dtype=dtype, device=device + ).normal_(mean=mean, std=std, generator=gen) + res = stats.kstest( + to_numpy(t.cpu(), torch.double), + "norm", + args=(mean, std), + ) self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def 
test_normal_cpu_vs_cuda(self): - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for mean in [-3, 0, 7]: for std in [1, 5, 7]: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').normal_(mean=mean, std=std, generator=gen) + cpu_t = torch.empty(self.size, dtype=dtype, device="cpu").normal_( + mean=mean, std=std, generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').normal_(mean=mean, std=std, generator=gen) - self.assertTrue((cpu_t - cuda_t.cpu()).abs().max() < 1e-9) + cuda_t = torch.empty(self.size, dtype=dtype, device="cuda").normal_( + mean=mean, std=std, generator=gen + ) + self.assertTrue(torch.allclose(cpu_t, cuda_t.cpu(), 1e-9)) def test_log_normal_kstest(self): for device in self.all_devices: for gen in self.all_generators: - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for mean in [-3, 0, 7]: for std in [1, 5, 7]: - t = torch.empty(self.size, dtype=dtype, device=device).log_normal_(mean=mean, std=std, generator=gen) - res = stats.kstest(t.cpu().to(torch.double), 'lognorm', args=(std, 0, math.exp(mean))) - self.assertTrue(res.statistic < 0.1) + t = torch.empty( + self.size, dtype=dtype, device=device + ).log_normal_(mean=mean, std=std, generator=gen) + res = stats.kstest( + to_numpy(t.cpu(), torch.double), + "lognorm", + args=(std, 0, math.exp(mean)), + ) + if dtype in [torch.half, torch.bfloat16]: + self.assertTrue(res.statistic < 0.4) + else: + self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_log_normal_cpu_vs_cuda(self): - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for mean in [-3, 0, 7]: for std in [1, 5, 7]: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').log_normal_(mean=mean, std=std, generator=gen) + cpu_t 
= torch.empty( + self.size, dtype=dtype, device="cpu" + ).log_normal_(mean=mean, std=std, generator=gen) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').log_normal_(mean=mean, std=std, generator=gen) - self.assertTrue((cpu_t - cuda_t.cpu()).abs().max() < 1e-4) + cuda_t = torch.empty( + self.size, dtype=dtype, device="cuda" + ).log_normal_(mean=mean, std=std, generator=gen) + self.assertTrue( + torch.allclose(cpu_t, cuda_t.cpu(), 1e-4, equal_nan=True) + ) def test_exponential_kstest(self): for device in self.all_devices: for gen in self.all_generators: - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for lambd in [0.5, 1.0, 5.0]: - t = torch.empty(self.size, dtype=dtype, device=device).exponential_(lambd=lambd, generator=gen) - res = stats.kstest(t.cpu().to(torch.double), 'expon', args=(0, 1 / lambd,)) + t = torch.empty( + self.size, dtype=dtype, device=device + ).exponential_(lambd=lambd, generator=gen) + res = stats.kstest( + to_numpy(t.cpu(), torch.double), + "expon", + args=( + 0, + 1 / lambd, + ), + ) self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) + @unittest.skip("https://github.com/pytorch/pytorch/issues/38662") def test_exponential_cpu_vs_cuda(self): - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for lambd in [0.5, 1.0, 5.0]: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').exponential_(lambd=lambd, generator=gen) + cpu_t = torch.empty(self.size, dtype=dtype, device="cpu").exponential_( + lambd=lambd, generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').exponential_(lambd=lambd, generator=gen) - self.assertTrue((cpu_t - cuda_t.cpu()).abs().max() < 1e-9) + cuda_t = torch.empty( + self.size, dtype=dtype, device="cuda" + 
).exponential_(lambd=lambd, generator=gen) + self.assertTrue(torch.allclose(cpu_t, cuda_t.cpu(), 1e-9)) def test_cauchy_kstest(self): for device in self.all_devices: for gen in self.all_generators: - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for median in [-10, 0, 50]: for sigma in [0.5, 1.0, 10.0]: - t = torch.empty(self.size, dtype=dtype, device=device).cauchy_(median=median, sigma=sigma, generator=gen) - res = stats.kstest(t.cpu().to(torch.double), 'cauchy', args=(median, sigma)) - self.assertTrue(res.statistic < 0.1) + t = torch.empty( + self.size, dtype=dtype, device=device + ).cauchy_(median=median, sigma=sigma, generator=gen) + res = stats.kstest( + to_numpy(t.cpu(), torch.double), + "cauchy", + args=(median, sigma), + ) + if dtype in [torch.half, torch.bfloat16]: + self.assertTrue(res.statistic < 0.4) + else: + self.assertTrue(res.statistic < 0.1) - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_cauchy_cpu_vs_cuda(self): - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for median in [-10, 0, 50]: for sigma in [0.5, 1.0, 10.0]: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').cauchy_(median=median, sigma=sigma, generator=gen) + cpu_t = torch.empty(self.size, dtype=dtype, device="cpu").cauchy_( + median=median, sigma=sigma, generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').cauchy_(median=median, sigma=sigma, generator=gen) - self.assertTrue((cpu_t - cuda_t.cpu()).abs().max() < 1e-9) + cuda_t = torch.empty(self.size, dtype=dtype, device="cuda").cauchy_( + median=median, sigma=sigma, generator=gen + ) + self.assertTrue(torch.allclose(cpu_t, cuda_t.cpu(), 1e-9)) def test_geometric(self): for device in self.all_devices: for gen in self.all_generators: - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: 
for p in [0.2, 0.5, 0.8]: - t = torch.empty(self.size, dtype=dtype, device=device).geometric_(p=p, generator=gen) + t = torch.empty( + self.size, dtype=dtype, device=device + ).geometric_(p=p, generator=gen) # actual = np.histogram(t.cpu().to(torch.double), np.arange(1, 100))[0] # expected = stats.geom(p).pmf(np.arange(1, 99)) * self.size # res = stats.chisquare(actual, expected) # self.assertAlmostEqual(res.pvalue, 1.0, delta=0.5) TODO https://github.com/pytorch/csprng/issues/7 - @unittest.skipIf(not csprng.supports_cuda(), "csprng was not compiled with CUDA support") + @unittest.skipIf(no_cuda, no_cuda_message) def test_geometric_cpu_vs_cuda(self): - for dtype in self.fp_ftypes: + for dtype in self.fp_dtypes: for p in [0.2, 0.5, 0.8]: gen = csprng.create_mt19937_generator(42) - cpu_t = torch.empty(self.size, dtype=dtype, device='cpu').geometric_(p=p, generator=gen) + cpu_t = torch.empty(self.size, dtype=dtype, device="cpu").geometric_( + p=p, generator=gen + ) gen = csprng.create_mt19937_generator(42) - cuda_t = torch.empty(self.size, dtype=dtype, device='cuda').geometric_(p=p, generator=gen) - self.assertTrue((cpu_t - cuda_t.cpu()).abs().max() < 1e-9) + cuda_t = torch.empty(self.size, dtype=dtype, device="cuda").geometric_( + p=p, generator=gen + ) + self.assertTrue( + torch.allclose(cpu_t, cuda_t.cpu(), 1e-9, equal_nan=True) + ) def test_non_contiguous_vs_contiguous(self): size = 10 @@ -262,7 +411,7 @@ def test_non_contiguous_vs_contiguous(self): y2 = random.randrange(y1 + 1, max(y1 + 2, size)) z2 = random.randrange(z1 + 1, max(z1 + 2, size)) maybe_non_contiguous = t[x1:x2, y1:y2, z1:z2] - assert(maybe_non_contiguous.numel() > 0) + assert maybe_non_contiguous.numel() > 0 if not maybe_non_contiguous.is_contiguous(): seed = random.randrange(1000) @@ -275,23 +424,28 @@ def test_non_contiguous_vs_contiguous(self): gen = csprng.create_mt19937_generator(seed) contiguous.random_(generator=gen) - assert(contiguous.is_contiguous()) + assert contiguous.is_contiguous() 
self.assertTrue((non_contiguous == contiguous).all()) for x in range(0, size): for y in range(0, size): for z in range(0, size): - if not x1 <= x < x2 and not y1 <= y < y2 and not z1 <= z < z2: + if ( + not x1 <= x < x2 + and not y1 <= y < y2 + and not z1 <= z < z2 + ): self.assertTrue(t[x, y, z] == 0) + @unittest.skipIf(IS_SANDCASTLE or IS_FBCODE, "Does not work on Sandcastle") @unittest.skipIf(torch.get_num_threads() < 2, "requires multithreading CPU") def test_cpu_parallel(self): - urandom_gen = csprng.create_random_device_generator('/dev/urandom') + urandom_gen = csprng.create_random_device_generator("/dev/urandom") def measure(size): - t = torch.empty(size, dtype=torch.float32, device='cpu') + t = torch.empty(size, dtype=torch.float32, device="cpu") start = time.time() - for i in range(10): + for i in range(20): t.normal_(generator=urandom_gen) finish = time.time() return finish - start @@ -299,7 +453,202 @@ def measure(size): time_for_1K = measure(1000) time_for_1M = measure(1000000) # Pessimistic check that parallel execution gives >= 1.5 performance boost - self.assertTrue(time_for_1M/time_for_1K < 1000 / min(1.5, torch.get_num_threads())) + self.assertTrue(time_for_1M / time_for_1K < 1000 / 1.5) -if __name__ == '__main__': + @unittest.skipIf(IS_SANDCASTLE or IS_FBCODE, "Does not work on Sandcastle") + def test_version(self): + self.assertTrue(csprng.__version__) + self.assertTrue(csprng.git_version) + + def test_randperm(self): + for device in self.all_devices: + for gen in self.all_generators: + for dtype in self.int_dtypes: + for size in range(0, 20): + expected = torch.arange(size, dtype=dtype, device=device) + + actual = torch.randperm( + size, dtype=dtype, device=device, generator=gen + ) + + actual_out = torch.empty(1, dtype=dtype, device=device) + torch.randperm(size, out=actual_out, generator=gen) + + if size >= 10: + self.assertTrue(not torch.allclose(expected, actual)) + self.assertTrue(not torch.allclose(expected, actual_out)) + + actual = 
actual.sort()[0] + actual_out = actual.sort()[0] + + self.assertTrue(torch.allclose(expected, actual)) + self.assertTrue(torch.allclose(expected, actual_out)) + + def test_encrypt_decrypt(self): + key_size_bytes = 16 + block_size_bytes = 16 + + def sizeof(dtype): + if dtype == torch.bool: + return 1 + elif dtype.is_floating_point: + return torch.finfo(dtype).bits // 8 + else: + return torch.iinfo(dtype).bits // 8 + + def pad(data, pad_size): + if len(data) % pad_size == 0: + return data + length = pad_size - (len(data) % pad_size) + return data + bytes([0]) * length + + def create_aes(m, k): + if m == "ecb": + return AES.new(k.tobytes(), AES.MODE_ECB) + elif m == "ctr": + ctr = Counter.new( + AES.block_size * 8, initial_value=0, little_endian=True + ) + return AES.new(k.tobytes(), AES.MODE_CTR, counter=ctr) + else: + return None + + for key_dtype in self.all_dtypes: + key_size = key_size_bytes // sizeof(key_dtype) + key = torch.empty(key_size, dtype=key_dtype).random_() + key_np = to_bytes(key) + for initial_dtype in self.all_dtypes: + for initial_size in [0, 4, 8, 15, 16, 23, 42]: + initial = torch.empty(initial_size, dtype=initial_dtype).random_() + initial_np = to_bytes(initial) + initial_size_bytes = initial_size * sizeof(initial_dtype) + for encrypted_dtype in self.all_dtypes: + encrypted_size = ( + (initial_size_bytes + block_size_bytes - 1) + // block_size_bytes + * block_size_bytes + // sizeof(encrypted_dtype) + ) + encrypted = torch.zeros(encrypted_size, dtype=encrypted_dtype) + for decrypted_dtype in self.all_dtypes: + decrypted_size = ( + initial_size_bytes + sizeof(decrypted_dtype) - 1 + ) // sizeof(decrypted_dtype) + decrypted = torch.zeros( + decrypted_size, dtype=decrypted_dtype + ) + for mode in ["ecb", "ctr"]: + for device in self.all_devices: + key = key.to(device) + initial = initial.to(device) + encrypted = encrypted.to(device) + decrypted = decrypted.to(device) + + csprng.encrypt( + initial, encrypted, key, "aes128", mode + ) + encrypted_np = 
to_bytes(encrypted) + + aes = create_aes(mode, key_np) + + encrypted_expected = np.frombuffer( + aes.encrypt( + pad(initial_np.tobytes(), block_size_bytes) + ), + dtype=np.int8, + ) + self.assertTrue( + np.array_equal(encrypted_np, encrypted_expected) + ) + + csprng.decrypt( + encrypted, decrypted, key, "aes128", mode + ) + decrypted_np = to_bytes(decrypted)[ + :initial_size_bytes + ] + + aes = create_aes(mode, key_np) + + decrypted_expected = np.frombuffer( + aes.decrypt( + pad( + encrypted_np.tobytes(), block_size_bytes + ) + ), + dtype=np.int8, + )[:initial_size_bytes] + self.assertTrue( + np.array_equal(decrypted_np, decrypted_expected) + ) + + self.assertTrue( + np.array_equal(initial_np, decrypted_np) + ) + + def test_encrypt_decrypt_inplace(self): + key_size_bytes = 16 + + def sizeof(dtype): + if dtype == torch.bool: + return 1 + elif dtype.is_floating_point: + return torch.finfo(dtype).bits // 8 + else: + return torch.iinfo(dtype).bits // 8 + + def create_aes(m, k): + if m == "ecb": + return AES.new(k.tobytes(), AES.MODE_ECB) + elif m == "ctr": + ctr = Counter.new( + AES.block_size * 8, initial_value=0, little_endian=True + ) + return AES.new(k.tobytes(), AES.MODE_CTR, counter=ctr) + else: + return None + + for key_dtype in self.all_dtypes: + key_size = key_size_bytes // sizeof(key_dtype) + key = torch.empty(key_size, dtype=key_dtype).random_() + key_np = to_bytes(key) + for initial_dtype in self.all_dtypes: + for initial_size_bytes in [0, 16, 256]: + initial_size = initial_size_bytes // sizeof(initial_dtype) + initial = torch.empty(initial_size, dtype=initial_dtype).random_() + initial_np = to_bytes(initial) + initial_np_copy = np.copy(initial_np) + for mode in ["ecb", "ctr"]: + for device in self.all_devices: + key = key.to(device) + initial = initial.to(device) + + csprng.encrypt(initial, initial, key, "aes128", mode) + encrypted_np = to_bytes(initial) + aes = create_aes(mode, key_np) + encrypted_expected = np.frombuffer( + 
aes.encrypt(initial_np_copy.tobytes()), dtype=np.int8 + ) + self.assertTrue( + np.array_equal(encrypted_np, encrypted_expected) + ) + + encrypted_np_copy = np.copy(encrypted_np) + + csprng.decrypt(initial, initial, key, "aes128", mode) + decrypted_np = to_bytes(initial) + aes = create_aes(mode, key_np) + decrypted_expected = np.frombuffer( + aes.decrypt(encrypted_np_copy.tobytes()), dtype=np.int8 + ) + self.assertTrue( + np.array_equal(decrypted_np, decrypted_expected) + ) + + self.assertTrue( + np.array_equal(initial_np_copy, decrypted_np) + ) + + +if __name__ == "__main__": unittest.main() diff --git a/torch_csprng/csrc/block_cipher.h b/torch_csprng/csrc/block_cipher.h deleted file mode 100644 index ddde94c..0000000 --- a/torch_csprng/csrc/block_cipher.h +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include "macros.h" -#include -#include -#include "OffsetCalculator.cuh" -#include -#include -#include - -#if defined(__CUDACC__) || defined(__HIPCC__) -#include -#include -#endif - -#if defined(__CUDACC__) || defined(__HIPCC__) -#define UNROLL_IF_CUDA #pragma unroll -#else -#define UNROLL_IF_CUDA -#endif - -namespace torch { -namespace custom_prng { - -// Generates `block_t_size`-bytes random key Tensor on CPU -// using `generator`, which must be an instance of `at::CPUGeneratorImpl` -// and passes it to the `device`. 
-template -at::Tensor key_tensor(c10::optional generator, size_t block_t_size, at::Device device) { - std::lock_guard lock(generator->mutex()); - auto gen = at::check_generator(generator); - auto t = torch::empty({static_cast(block_t_size)}, torch::kUInt8); - for (size_t i = 0; i < block_t_size; i++) { - t[i] = static_cast(gen->random()); - } - return t.to(device); -} - -// A simple container for random state sub-blocks that implements RNG interface -// with random() and random64() methods, that are used by transformation function -template -struct RNGValues { - TORCH_CSPRNG_HOST_DEVICE RNGValues(uint64_t* vals) { - memcpy(&vals_, vals, size * sizeof(uint64_t)); - } - uint32_t TORCH_CSPRNG_HOST_DEVICE random() { auto res = static_cast(vals_[index]); index++; return res; } - uint64_t TORCH_CSPRNG_HOST_DEVICE random64() { auto res = vals_[index]; index++; return res; } -private: - uint64_t vals_[size]; - int index = 0; -}; - -// Runs a block cipher in a counter mode in approximately `numel / (block_t_size / sizeof(uint_t) / N)` CUDA threads, -// without any assumption about target tensor layout. It uses `index_calc` to find memory locations of -// the tensor elements. -// `scalar_t` is a scalar type equivalent of target tensor dtype -// `uint_t` is an unsigned integral type of sub-blocks that random state is divided to -// (e.g, 16 bytes random state block can be divided into 16 uint8_t sub-blocks -// or 8 uint16_t sub-block or 4 uint32_t sub-block or 2 uint64_t sub-blocks) -// `N` is a number of sub-block which is used by `transform_func` -// to generate a random value of specific distribution (e.g. `normal` uses 2) -// `numel` is a number of elements in target tensor -// `block_t_size` is a number of bytes in cipher's block (e.g. 
16 for AES128) -// `cipher` is a callable that receives a counter `idx` and returns an encrypted block -// `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t` -template -TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper(int idx, scalar_t* data, int64_t numel, size_t block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) { - const int unroll_factor = block_t_size / sizeof(uint_t) / N; - if (unroll_factor * idx < numel) { - auto block = cipher(idx); - UNROLL_IF_CUDA - for (auto i = 0; i < unroll_factor; ++i) { - const auto li = unroll_factor * idx + i; - if (li < numel) { - uint64_t vals[N]; - UNROLL_IF_CUDA - for (size_t j = 0; j < N; j++) { - vals[j] = (reinterpret_cast(&block))[N * i + j]; - } - RNGValues rng(vals); - data[index_calc(li)] = transform_func(&rng); - } - } - } -} - -#if defined(__CUDACC__) || defined(__HIPCC__) -template -__global__ static void block_cipher_kernel_cuda(scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) { - const auto idx = blockIdx.x * blockDim.x + threadIdx.x; - block_cipher_kernel_helper(idx, data, numel, block_t_size, cipher, transform_func, index_calc); -} -#endif - -template -static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) { - for (auto idx = begin; idx < end; ++idx) { - block_cipher_kernel_helper(idx, data, numel, block_t_size, cipher, transform_func, index_calc); - } -} - -template -static void block_cipher_kernel_cpu(int64_t total, scalar_t* data, int64_t numel, int block_t_size, cipher_t cipher, transform_t transform_func, index_calc_t index_calc) { - if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) { - block_cipher_kernel_cpu_serial(0, total, data, numel, block_t_size, cipher, 
transform_func, index_calc); - } else { - at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) { - block_cipher_kernel_cpu_serial(begin, end, data, numel, block_t_size, cipher, transform_func, index_calc); - }); - } -} - -// Runs a block cipher in a counter mode in approximately `numel / (block_t_size / sizeof(uint_t) / N)` CUDA threads. -// Each CUDA thread generates `block_t_size`-bytes random state and divides it into `block_t_size / sizeof(uint_t)` sub-blocks. -// Then `transform_func` transforms `N` random state sub-blocks passed in a `RNGValues` to final random values of type `scalar_t`. -template -void block_cipher_ctr_mode(at::TensorIterator& iter, int block_t_size, cipher_t cipher, transform_t transform_func) { - const auto numel = iter.numel(); - if (numel == 0) { - return; - } - const int unroll_factor = block_t_size / sizeof(uint_t) / N; - const auto block = 256; - const auto grid = (numel + (block * unroll_factor) - 1) / (block * unroll_factor); - scalar_t* data = (scalar_t*)iter.data_ptr(0); - auto offset_calc = make_offset_calculator<1>(iter); - auto index_calc_identity = [] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { return li; }; - auto index_calc_offset = [offset_calc] TORCH_CSPRNG_HOST_DEVICE (int li) -> int { return offset_calc.get(li)[0] / sizeof(scalar_t); }; - if (iter.device_type() == at::kCPU) { - if (iter.output(0).is_contiguous()) { - block_cipher_kernel_cpu( - grid * block, data, numel, block_t_size, cipher, transform_func, index_calc_identity); - } else { - block_cipher_kernel_cpu( - grid * block, data, numel, block_t_size, cipher, transform_func, index_calc_offset); - } - } else if (iter.device_type() == at::kCUDA) { -#if defined(__CUDACC__) || defined(__HIPCC__) - auto stream = at::cuda::getCurrentCUDAStream(); - if (iter.output(0).is_contiguous()) { - block_cipher_kernel_cuda<<>>( - data, numel, block_t_size, cipher, transform_func, index_calc_identity); - } else { - block_cipher_kernel_cuda<<>>( - 
data, numel, block_t_size, cipher, transform_func, index_calc_offset); - } - AT_CUDA_CHECK(cudaGetLastError()); -#else - TORCH_CHECK(false, "csprng was compiled without CUDA support"); -#endif - } else { - TORCH_CHECK(false, "block_cipher_ctr_mode supports only CPU and CUDA devices"); - } -} - -}} diff --git a/torch_csprng/csrc/csprng.cpp b/torch_csprng/csrc/csprng.cpp deleted file mode 100644 index c526086..0000000 --- a/torch_csprng/csrc/csprng.cpp +++ /dev/null @@ -1 +0,0 @@ -#include "csprng.h" diff --git a/torch_csprng/csrc/csprng.cu b/torch_csprng/csrc/csprng.cu deleted file mode 100644 index c526086..0000000 --- a/torch_csprng/csrc/csprng.cu +++ /dev/null @@ -1 +0,0 @@ -#include "csprng.h" diff --git a/torch_csprng/csrc/csprng.h b/torch_csprng/csrc/csprng.h deleted file mode 100644 index 39aa3e1..0000000 --- a/torch_csprng/csrc/csprng.h +++ /dev/null @@ -1,387 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "macros.h" -#include "block_cipher.h" -#include "aes.h" - -#if defined(__CUDACC__) || defined(__HIPCC__) -#include -#include -#endif - -using namespace at; -using namespace at::native::templates; -using namespace torch::custom_prng; - -inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { - return (static_cast(hi) << 32) | lo; -} - -// CUDA CSPRNG is actually CPU generator which is used only to generate a random key on CPU for AES running in a block mode on CUDA -struct CustomGeneratorImpl : public c10::GeneratorImpl { - CustomGeneratorImpl(bool use_rd) : c10::GeneratorImpl{Device(DeviceType::CPU), DispatchKeySet(DispatchKey::CustomRNGKeyId)}, use_rd_{use_rd} {} - CustomGeneratorImpl(const std::string& token) : c10::GeneratorImpl{Device(DeviceType::CPU), DispatchKeySet(DispatchKey::CustomRNGKeyId)}, use_rd_{true}, rd_{token} {} - CustomGeneratorImpl(uint64_t seed) : c10::GeneratorImpl{Device(DeviceType::CPU), DispatchKeySet(DispatchKey::CustomRNGKeyId)}, use_rd_{false}, 
mt_{static_cast(seed)} { } - ~CustomGeneratorImpl() = default; - uint32_t random() { return use_rd_ ? rd_() : mt_(); } - uint64_t random64() { return use_rd_ ? make64BitsFrom32Bits(rd_(), rd_()) : make64BitsFrom32Bits(mt_(), mt_()); } - - void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } - uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } - uint64_t seed() override { throw std::runtime_error("not implemented"); } - CustomGeneratorImpl* clone_impl() const override { throw std::runtime_error("not implemented"); } - - static DeviceType device_type() { return DeviceType::CPU; } - - bool use_rd_; - std::random_device rd_; - std::mt19937 mt_; -}; - -// ==================================================================================================================== - -// Applies AES in CTR mode with the `key` for passed TensorIterator iter. -// `scalar_t` is a scalar type equivalent of target tensor dtype -// `uint_t` is an unsigned integral type of sub-blocks that random state is divided to -// (e.g, 16 bytes random state block can be divided into 16 uint8_t sub-blocks -// or 8 uint16_t sub-block or 4 uint32_t sub-block or 2 uint64_t sub-blocks) -// `N` is a number of sub-block which is used by `transform_func` -// to generate a random value of specific distribution (e.g. 
`normal` uses 2) -// `key` is a CUDA pointer to random key memory block -// `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t` -template -void aes_helper(TensorIterator& iter, const uint8_t* key, transform_t transform_func) { - block_cipher_ctr_mode(iter, aes::block_t_size, - [key] TORCH_CSPRNG_HOST_DEVICE (unsigned int idx) -> aes::block_t { - aes::block_t block; - memset(&block, 0, aes::block_t_size); - *(reinterpret_cast(&block)) = idx; - aes::encrypt(reinterpret_cast(&block), key); - return block; - }, - transform_func - ); -} - -// ==================================================================================================================== - -// A mapping between scalar type and corresponding unsigned integer type of random state sub-block. -// uint64_t for double and long, uint32_t for the rest -template -struct UIntType {}; - -template <> struct UIntType { using type = uint64_t; }; -template <> struct UIntType { using type = uint32_t; }; -template <> struct UIntType { using type = uint64_t; }; -template <> struct UIntType { using type = uint32_t; }; -template <> struct UIntType { using type = uint32_t; }; -template <> struct UIntType { using type = uint32_t; }; -template <> struct UIntType { using type = uint32_t; }; -template <> struct UIntType { using type = uint32_t; }; - -// ==================================================== Random ======================================================== - -template -struct RandomKernel { - void operator()(TensorIterator& iter, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_ALL_TYPES_AND(ScalarType::Bool, iter.dtype(), "random_kernel", [&] { - aes_helper::type>(iter, key, - [] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { - uniform_int_distribution random; - return random(generator); - } - ); - }); - } -}; - 
-template -void random_from_to_kernel_helper(TensorIterator& iter, uint64_t range, int64_t base, const uint8_t* key) { - aes_helper(iter, key, - [range, base] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { - uniform_int_from_to_distribution random(range, base); - return random(generator); - } - ); -} - -template -void random_full_range_kernel_helper(TensorIterator& iter, const uint8_t* key) { - aes_helper(iter, key, - [] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { - uniform_int_full_range_distribution random; - return random(generator); - } - ); -} - -template -struct RandomFromToKernel { - void operator()(TensorIterator& iter, uint64_t range, int64_t base, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "random_from_to_kernel", [&] { - if (( - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && range >= 1ULL << 32) - { - random_from_to_kernel_helper(iter, range, base, key); - } else { - random_from_to_kernel_helper(iter, range, base, key); - } - }); - } - void operator()(TensorIterator& iter, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel", [&] { - if (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) - { - random_full_range_kernel_helper(iter, key); - } else { - TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16"); - } - }); - } -}; - -Tensor& random_(Tensor& self, c10::optional generator) { - return random_impl(self, generator); -} - -Tensor& random_from_to(Tensor& self, int64_t 
from, optional to, c10::optional generator) { - return random_from_to_impl(self, from, to, generator); -} - -Tensor& random_to(Tensor& self, int64_t to, c10::optional generator) { - return random_from_to(self, 0, to, generator); -} - -// ==================================================== Uniform ======================================================= - -template -struct UniformKernel { - void operator()(TensorIterator& iter, double from, double to, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "uniform_kernel", [&] { - aes_helper(iter, key, - [from, to] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { - uniform_real_distribution uniform(from, to); - return static_cast(uniform(generator)); - } - ); - }); - } -}; - -Tensor& uniform_(Tensor& self, double from, double to, c10::optional generator) { - return uniform_impl_(self, from, to, generator); -} - -// ==================================================== Normal ======================================================== - -template -struct NormalKernel { - void operator()(Tensor& self, double mean, double std, c10::optional generator) { - auto iter = TensorIterator::nullary_op(self); - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "normal_kernel", [&] { - aes_helper(iter, key, - [mean, std] TORCH_CSPRNG_HOST_DEVICE (RNGValues<2>* gen) -> scalar_t { - normal_distribution normal(mean, std); - return static_cast(normal(gen)); - } - ); - }); - } -}; - -Tensor& normal_(Tensor& self, double mean, double std, c10::optional generator) { - return normal_impl_(self, mean, std, generator); -} - -Tensor& normal_Tensor_float_out(Tensor& output, const Tensor& mean, double std, c10::optional gen) { - return normal_out_impl(output, mean, std, gen); -} - -Tensor& 
normal_float_Tensor_out(Tensor& output, double mean, const Tensor& std, c10::optional gen) { - return normal_out_impl(output, mean, std, gen); -} - -Tensor& normal_Tensor_Tensor_out(Tensor& output, const Tensor& mean, const Tensor& std, c10::optional gen) { - return normal_out_impl(output, mean, std, gen); -} - -Tensor normal_Tensor_float(const Tensor& mean, double std, c10::optional gen) { - return normal_impl(mean, std, gen); -} - -Tensor normal_float_Tensor(double mean, const Tensor& std, c10::optional gen) { - return normal_impl(mean, std, gen); -} - -Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, c10::optional gen) { - return normal_impl(mean, std, gen); -} - -// ==================================================== Cauchy ======================================================== - -template -struct CauchyKernel { - void operator()(TensorIterator& iter, double median, double sigma, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "cauchy_kernel", [&] { - aes_helper(iter, key, - [median, sigma] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* gen) -> scalar_t { - cauchy_distribution cauchy(median, sigma); - return static_cast(cauchy(gen)); - } - ); - }); - } -}; - -Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional generator) { - return cauchy_impl_(self, median, sigma, generator); -} - -// ================================================== LogNormal ======================================================= - -template -struct LogNormalKernel { - void operator()(TensorIterator& iter, double mean, double std, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "log_normal", [&] { - aes_helper(iter, key, - [mean, std] TORCH_CSPRNG_HOST_DEVICE (RNGValues<2>* gen) -> 
scalar_t { - lognormal_distribution logNormal(mean, std); - return static_cast(logNormal(gen)); - } - ); - }); - } -}; - -Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional gen) { - return log_normal_impl_(self, mean, std, gen); -} - -// ================================================== Geometric ======================================================= - -template -struct GeometricKernel { - void operator()(TensorIterator& iter, double p, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "geometric_kernel", [&] { - aes_helper::type, 1>(iter, key, - [p] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* gen) -> scalar_t { - geometric_distribution geometric(p); - return geometric(gen); - } - ); - }); - } -}; - -Tensor& geometric_(Tensor& self, double p, c10::optional gen) { - return geometric_impl_(self, p, gen); -} - -// ================================================== Exponential ===================================================== - -template -struct ExponentialKernel { - void operator()(TensorIterator& iter, double lambda, c10::optional generator) { - const Tensor key_t = key_tensor(generator, aes::block_t_size, iter.device()); - const auto key = key_t.data_ptr(); - AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "exponential_kernel", [&] { - aes_helper(iter, key, - [lambda] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* gen) -> scalar_t { - exponential_distribution exponential(lambda); - return static_cast(exponential(gen)); - } - ); - }); - } -}; - -Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) { - return exponential_impl_(self, lambda, gen); -} - -// ==================================================================================================================== - -Generator create_random_device_generator(c10::optional token = c10::nullopt) { - if (token.has_value()) { - return 
make_generator(*token); - } else { - return make_generator(true); - } -} - -Generator create_mt19937_generator(c10::optional seed = c10::nullopt) { - if (seed.has_value()) { - return make_generator(*seed); - } else { - return make_generator(false); - } -} - -bool supports_cuda() { -#if defined(__CUDACC__) || defined(__HIPCC__) - return true; -#else - return false; -#endif -} - -TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) { - // Random - m.impl_UNBOXED("random_.from", random_from_to); - m.impl_UNBOXED("random_.to", random_to); - m.impl_UNBOXED("random_", random_); - // Uniform - m.impl_UNBOXED("uniform_", uniform_); - // Normal - m.impl_UNBOXED("normal_", normal_); - m.impl_UNBOXED("normal.Tensor_float_out", normal_Tensor_float_out); - m.impl_UNBOXED("normal.float_Tensor_out", normal_float_Tensor_out); - m.impl_UNBOXED("normal.Tensor_Tensor_out", normal_Tensor_Tensor_out); - m.impl_UNBOXED("normal.Tensor_float", normal_Tensor_float); - m.impl_UNBOXED("normal.float_Tensor", normal_float_Tensor); - m.impl_UNBOXED("normal.Tensor_Tensor", normal_Tensor_Tensor); - // Cauchy - m.impl_UNBOXED("cauchy_", cauchy_); - // LogNormal - m.impl_UNBOXED("log_normal_", log_normal_); - // Geometric - m.impl_UNBOXED("geometric_", geometric_); - // Exponential - m.impl_UNBOXED("exponential_", exponential_); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("supports_cuda", &supports_cuda); - m.def("create_random_device_generator", &create_random_device_generator, py::arg("token") = nullptr); - m.def("create_mt19937_generator", &create_mt19937_generator, py::arg("seed") = nullptr); -} diff --git a/torchcsprng/__init__.py b/torchcsprng/__init__.py new file mode 100644 index 0000000..60a98d6 --- /dev/null +++ b/torchcsprng/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch + +from torchcsprng._C import * + + +try: + from .version import __version__, git_version # noqa: F401 +except ImportError: + pass diff --git a/torchcsprng/__init__.pyi b/torchcsprng/__init__.pyi new file mode 100644 index 0000000..dcc28c2 --- /dev/null +++ b/torchcsprng/__init__.pyi @@ -0,0 +1,14 @@ +# Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from torch import Generator, Tensor + +def supports_cuda() -> bool: ... +def create_random_device_generator(token: str = "") -> Generator: ... +def create_mt19937_generator(seed: int = 0): ... +def encrypt(input: Tensor, output: Tensor, key: Tensor, cipher, mode): ... +def decrypt(input: Tensor, output: Tensor, key: Tensor, cipher, mode): ... +def __version__() -> str: ... +def git_version() -> str: ... diff --git a/torch_csprng/csrc/OffsetCalculator.cuh b/torchcsprng/csrc/OffsetCalculator.cuh similarity index 93% rename from torch_csprng/csrc/OffsetCalculator.cuh rename to torchcsprng/csrc/OffsetCalculator.cuh index 1c76d70..671e37d 100644 --- a/torch_csprng/csrc/OffsetCalculator.cuh +++ b/torchcsprng/csrc/OffsetCalculator.cuh @@ -1,3 +1,10 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #include diff --git a/torch_csprng/csrc/THCIntegerDivider.cuh b/torchcsprng/csrc/THCIntegerDivider.cuh similarity index 94% rename from torch_csprng/csrc/THCIntegerDivider.cuh rename to torchcsprng/csrc/THCIntegerDivider.cuh index 9d57ef9..bc124b2 100644 --- a/torch_csprng/csrc/THCIntegerDivider.cuh +++ b/torchcsprng/csrc/THCIntegerDivider.cuh @@ -1,3 +1,10 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + #ifndef THC_INTEGER_DIVIDER_INC #define THC_INTEGER_DIVIDER_INC diff --git a/torch_csprng/csrc/aes.h b/torchcsprng/csrc/aes.inc similarity index 66% rename from torch_csprng/csrc/aes.h rename to torchcsprng/csrc/aes.inc index 7a9a287..463dd5c 100644 --- a/torch_csprng/csrc/aes.h +++ b/torchcsprng/csrc/aes.inc @@ -1,10 +1,10 @@ -#pragma once +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ -#include "macros.h" -#include - -namespace torch { -namespace custom_prng { namespace aes { // This AES implementation is based on @@ -55,15 +55,7 @@ namespace aes { #define Nr 10 // The number of rounds in AES Cipher. #endif -#if !defined(__CUDACC__) && !defined(__HIPCC__) -struct ulonglong2 // TODO: should have something like `__builtin_align__(16)` -{ - unsigned long long int x, y; -}; -#endif - -typedef ulonglong2 block_t; -constexpr size_t block_t_size = sizeof(block_t); +constexpr size_t block_t_size = 16; typedef uint8_t state_t[4][4]; @@ -89,6 +81,24 @@ TORCH_CSPRNG_CONSTANT const uint8_t sbox[256] = { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; +TORCH_CSPRNG_CONSTANT const uint8_t rsbox[256] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 
0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }; + // The round constant word array, Rcon[i], contains the values given by // x to the power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8) TORCH_CSPRNG_CONSTANT const uint8_t Rcon[11] = { @@ -96,6 +106,8 @@ TORCH_CSPRNG_CONSTANT const uint8_t Rcon[11] = { #define getSBoxValue(num) (sbox[(num)]) +#define getSBoxInvert(num) (rsbox[(num)]) + // This function produces Nb(Nr+1) round keys. The round keys are used in each round to decrypt the states. 
TORCH_CSPRNG_HOST_DEVICE void KeyExpansion(uint8_t* RoundKey, const uint8_t* Key){ unsigned int i, j, k; @@ -249,6 +261,78 @@ TORCH_CSPRNG_HOST_DEVICE void MixColumns(state_t* state) } } +TORCH_CSPRNG_HOST_DEVICE uint8_t Multiply(uint8_t x, uint8_t y) +{ + return (((y & 1) * x) ^ + ((y>>1 & 1) * xtime(x)) ^ + ((y>>2 & 1) * xtime(xtime(x))) ^ + ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ + ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))); /* this last call to xtime() can be omitted */ +} + +// MixColumns function mixes the columns of the state matrix. +// The method used to multiply may be difficult to understand for the inexperienced. +// Please use the references to gain more information. +TORCH_CSPRNG_HOST_DEVICE void InvMixColumns(state_t* state) +{ + int i; + uint8_t a, b, c, d; + for (i = 0; i < 4; ++i) + { + a = (*state)[i][0]; + b = (*state)[i][1]; + c = (*state)[i][2]; + d = (*state)[i][3]; + + (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09); + (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d); + (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b); + (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e); + } +} + +// The SubBytes Function Substitutes the values in the +// state matrix with values in an S-box. 
+TORCH_CSPRNG_HOST_DEVICE void InvSubBytes(state_t* state) +{ + uint8_t i, j; + for (i = 0; i < 4; ++i) + { + for (j = 0; j < 4; ++j) + { + (*state)[j][i] = getSBoxInvert((*state)[j][i]); + } + } +} + +TORCH_CSPRNG_HOST_DEVICE void InvShiftRows(state_t* state) +{ + uint8_t temp; + + // Rotate first row 1 columns to right + temp = (*state)[3][1]; + (*state)[3][1] = (*state)[2][1]; + (*state)[2][1] = (*state)[1][1]; + (*state)[1][1] = (*state)[0][1]; + (*state)[0][1] = temp; + + // Rotate second row 2 columns to right + temp = (*state)[0][2]; + (*state)[0][2] = (*state)[2][2]; + (*state)[2][2] = temp; + + temp = (*state)[1][2]; + (*state)[1][2] = (*state)[3][2]; + (*state)[3][2] = temp; + + // Rotate third row 3 columns to right + temp = (*state)[0][3]; + (*state)[0][3] = (*state)[1][3]; + (*state)[1][3] = (*state)[2][3]; + (*state)[2][3] = (*state)[3][3]; + (*state)[3][3] = temp; +} + TORCH_CSPRNG_HOST_DEVICE void encrypt(uint8_t* state, const uint8_t* key) { uint8_t RoundKey[176]; KeyExpansion(RoundKey, key); @@ -276,4 +360,29 @@ TORCH_CSPRNG_HOST_DEVICE void encrypt(uint8_t* state, const uint8_t* key) { AddRoundKey(Nr, (state_t*)state, RoundKey); } -}}} +TORCH_CSPRNG_HOST_DEVICE void decrypt(uint8_t* state, const uint8_t* key) { + uint8_t RoundKey[176]; + KeyExpansion(RoundKey, key); + + uint8_t round = 0; + + // Add the First round key to the state before starting the rounds. + AddRoundKey(Nr, (state_t*)state, RoundKey); + + // There will be Nr rounds. + // The first Nr-1 rounds are identical. + // These Nr rounds are executed in the loop below. 
+ // Last one without InvMixColumn() + for (round = (Nr - 1); ; --round) + { + InvShiftRows((state_t*)state); + InvSubBytes((state_t*)state); + AddRoundKey(round, (state_t*)state, RoundKey); + if (round == 0) { + break; + } + InvMixColumns((state_t*)state); + } +} + +} diff --git a/torchcsprng/csrc/block_cipher.h b/torchcsprng/csrc/block_cipher.h new file mode 100644 index 0000000..a949d52 --- /dev/null +++ b/torchcsprng/csrc/block_cipher.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include "macros.h" +#include +#include +#include "OffsetCalculator.cuh" +#include +#include +#include + +#if defined(__CUDACC__) || defined(__HIPCC__) +#include +#include +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) +#define UNROLL_IF_CUDA #pragma unroll +#else +#define UNROLL_IF_CUDA +#endif + +namespace torch { +namespace csprng { + +template +TORCH_CSPRNG_HOST_DEVICE static void copy_input_to_block(int64_t idx, uint8_t* block, int block_size, + void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc) { + for (auto i = 0; i < block_size / input_type_size; ++i) { + const auto linear_index = idx * (block_size / input_type_size) + i; + if (linear_index < input_numel) { + std::memcpy( + block + i * input_type_size, + &(reinterpret_cast(input_ptr)[input_index_calc(linear_index)]), + input_type_size + ); + } + } +} + +template +TORCH_CSPRNG_HOST_DEVICE static void copy_block_to_output(int64_t idx, uint8_t* block, int output_elem_per_block, + void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc) { + for (auto i = 0; i < output_elem_per_block; ++i) { + const auto linear_index = idx * output_elem_per_block + i; + if (linear_index < output_numel) { + std::memcpy( + 
&(reinterpret_cast(output_ptr)[output_index_calc(linear_index)]), + block + i * output_type_size, + output_type_size + ); + } + } +} + +template +TORCH_CSPRNG_HOST_DEVICE static void block_cipher_kernel_helper( + int64_t idx, cipher_t cipher, int output_elem_per_block, + void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc, + void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc, + transform_t transform) { + uint8_t block[block_size]; + std::memset(&block, 0, block_size); // is it ok to use zeros as padding? + if (input_ptr != nullptr) { + copy_input_to_block(idx, block, block_size, input_ptr, input_numel, input_type_size, input_index_calc); + } + cipher(idx, block); + transform(block); + copy_block_to_output(idx, block, output_elem_per_block, output_ptr, output_numel, output_type_size, output_index_calc); +} + +#if defined(__CUDACC__) || defined(__HIPCC__) +template +__global__ static void block_cipher_kernel_cuda(cipher_t cipher, int output_elem_per_block, + void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc, + void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc, + transform_t transform) { + const auto idx = blockIdx.x * blockDim.x + threadIdx.x; + block_cipher_kernel_helper(idx, cipher, output_elem_per_block, + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + transform); +} +#endif + +template +static void block_cipher_kernel_cpu_serial(int64_t begin, int64_t end, cipher_t cipher, int output_elem_per_block, + void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc, + void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc, + transform_t transform) { + for (auto idx = begin; idx < end; ++idx) { + block_cipher_kernel_helper(idx, 
cipher, output_elem_per_block, + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + transform); + } +} + +template +static void block_cipher_kernel_cpu(int64_t total, cipher_t cipher, int output_elem_per_block, + void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc, + void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc, + transform_t transform_func) { + if (total < at::internal::GRAIN_SIZE || at::get_num_threads() == 1) { + block_cipher_kernel_cpu_serial(0, total, cipher, output_elem_per_block, + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + transform_func); + } else { + at::parallel_for(0, total, at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) { + block_cipher_kernel_cpu_serial(begin, end, cipher, output_elem_per_block, + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + transform_func); + }); + } +} + +template +void block_cipher( + void* input_ptr, int64_t input_numel, int input_type_size, input_index_calc_t input_index_calc, + void* output_ptr, int64_t output_numel, int output_type_size, output_index_calc_t output_index_calc, + at::Device device, cipher_t cipher, int output_elem_per_block, transform_t transform_func) { + if (output_ptr == nullptr || output_numel == 0) { + return; + } + + if (device.type() == at::kCPU) { + const auto total = (output_numel + output_elem_per_block - 1) / output_elem_per_block; + block_cipher_kernel_cpu(total, + cipher, output_elem_per_block, + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + transform_func + ); + } else if (device.type() == at::kCUDA) { +#if defined(__CUDACC__) || defined(__HIPCC__) + const auto threads = 256; + 
const auto grid = (output_numel + (threads * output_elem_per_block) - 1) / (threads * output_elem_per_block); + auto stream = at::cuda::getCurrentCUDAStream(); + block_cipher_kernel_cuda<<>>( + cipher, output_elem_per_block, + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + transform_func + ); + AT_CUDA_CHECK(cudaGetLastError()); +#else + TORCH_CHECK(false, "torchcsprng was compiled without CUDA support"); +#endif + } else { + TORCH_CHECK(false, "block_cipher supports only CPU and CUDA devices"); + } +} + +template +void block_cipher(at::Tensor input, at::Tensor output, cipher_t cipher) { + const auto input_ptr = input.data_ptr(); + const auto input_numel = input.numel(); + + // Otherwise OffsetCalculator/IntDivider crashes with integer division by zero + if (input_ptr == nullptr || input_numel == 0) { + return; + } + + const auto input_type_size = input.element_size(); + const auto input_offset_calc = make_offset_calculator<1>(at::TensorIterator::nullary_op(input)); + const auto input_index_calc = [input_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t { + return input_offset_calc.get(li)[0]; + }; + + const auto output_ptr = output.data_ptr(); + const auto output_numel = output.numel(); + + // Otherwise OffsetCalculator/IntDivider crashes with integer division by zero + if (output_ptr == nullptr || output_numel == 0) { + return; + } + + const auto output_type_size = output.element_size(); + const auto output_offset_calc = make_offset_calculator<1>(at::TensorIterator::nullary_op(output)); + const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t { + return output_offset_calc.get(li)[0]; + }; + + const auto device = output.device(); + + torch::csprng::block_cipher( + input_ptr, input_numel, input_type_size, input_index_calc, + output_ptr, output_numel, output_type_size, output_index_calc, + device, cipher, block_size / 
output_type_size, + [] TORCH_CSPRNG_HOST_DEVICE (uint8_t* x) {}); +} + +}} diff --git a/torchcsprng/csrc/cpu/kernels.cpp b/torchcsprng/csrc/cpu/kernels.cpp new file mode 100644 index 0000000..5f86829 --- /dev/null +++ b/torchcsprng/csrc/cpu/kernels.cpp @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "../kernels_commons.h" + +namespace torch { +namespace csprng { +namespace cpu { + +#include "../kernels_body.inc" + +}}} diff --git a/torchcsprng/csrc/cpu/kernels.h b/torchcsprng/csrc/cpu/kernels.h new file mode 100644 index 0000000..f84af4a --- /dev/null +++ b/torchcsprng/csrc/cpu/kernels.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace torch { +namespace csprng { +namespace cpu { + +#include "../kernels_decls.inc" + +}}} diff --git a/torchcsprng/csrc/csprng.cpp b/torchcsprng/csrc/csprng.cpp new file mode 100644 index 0000000..8494253 --- /dev/null +++ b/torchcsprng/csrc/csprng.cpp @@ -0,0 +1,377 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include + +#include "kernels_commons.h" +#include "cpu/kernels.h" +#ifdef WITH_CUDA +#include "cuda/kernels.cuh" +#endif + +using namespace at; +using namespace torch::csprng; + +static const auto GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE = "generator does not support tensor device type"; +static const auto TENSOR_DEVICE_TYPE_IS_NOT_SUPPORTED = "tensor device type is not supported"; + +// ==================================================== Random ======================================================== + +Tensor& random_(Tensor& self, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::random_(self, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::random_(self, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor& random_from_to(Tensor& self, int64_t from, optional to, + c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::random_from_to(self, from, to, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::random_from_to(self, from, to, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor& random_to(Tensor& self, int64_t to, + c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::random_to(self, to, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::random_to(self, to, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// ==================================================== Uniform ======================================================= + +Tensor& uniform_(Tensor& self, double from, double to, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return 
cpu::uniform_(self, from, to, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::uniform_(self, from, to, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// ==================================================== Normal ======================================================== + +Tensor& normal_(Tensor& self, double mean, double std, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::normal_(self, mean, std, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_(self, mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor& normal_Tensor_float_out(const Tensor& mean, double std, c10::optional gen, Tensor& output) { + if (output.device().type() == DeviceType::CPU) { + return cpu::normal_Tensor_float_out(output, mean, std, gen); +#ifdef WITH_CUDA + } else if (output.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_Tensor_float_out(output, mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor& normal_float_Tensor_out(double mean, const Tensor& std, c10::optional gen, Tensor& output) { + if (output.device().type() == DeviceType::CPU) { + return cpu::normal_float_Tensor_out(output, mean, std, gen); +#ifdef WITH_CUDA + } else if (output.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_float_Tensor_out(output, mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor& normal_Tensor_Tensor_out(const Tensor& mean, const Tensor& std, c10::optional gen, Tensor& output) { + if (output.device().type() == DeviceType::CPU) { + return cpu::normal_Tensor_Tensor_out(output, mean, std, gen); +#ifdef WITH_CUDA + } else if 
(output.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_Tensor_Tensor_out(output, mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor normal_Tensor_float(const Tensor& mean, double std, c10::optional gen) { + if (mean.device().type() == DeviceType::CPU) { + return cpu::normal_Tensor_float(mean, std, gen); +#ifdef WITH_CUDA + } else if (mean.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_Tensor_float(mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor normal_float_Tensor(double mean, const Tensor& std, c10::optional gen) { + if (std.device().type() == DeviceType::CPU) { + return cpu::normal_float_Tensor(mean, std, gen); +#ifdef WITH_CUDA + } else if (std.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_float_Tensor(mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +Tensor normal_Tensor_Tensor(const Tensor& mean, const Tensor& std, c10::optional gen) { + if (mean.device().type() == DeviceType::CPU) { + return cpu::normal_Tensor_Tensor(mean, std, gen); +#ifdef WITH_CUDA + } else if (mean.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::normal_Tensor_Tensor(mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// ==================================================== Cauchy ======================================================== + +Tensor& cauchy_(Tensor& self, double median, double sigma, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::cauchy_(self, median, sigma, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::cauchy_(self, median, sigma, gen); +#endif + } else { + TORCH_CHECK(false, 
GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// ================================================== LogNormal ======================================================= + +Tensor& log_normal_(Tensor& self, double mean, double std, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::log_normal_(self, mean, std, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::log_normal_(self, mean, std, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// ================================================== Geometric ======================================================= + +Tensor& geometric_(Tensor& self, double p, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::geometric_(self, p, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::geometric_(self, p, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// ================================================== Exponential ===================================================== + +Tensor& exponential_(Tensor& self, double lambda, c10::optional gen) { + if (self.device().type() == DeviceType::CPU) { + return cpu::exponential_(self, lambda, gen); +#ifdef WITH_CUDA + } else if (self.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::exponential_(self, lambda, gen); +#endif + } else { + TORCH_CHECK(false, GENERATOR_DOES_NOT_SUPPORT_TENSOR_DEVICE_TYPE); + } +} + +// =============================================== Random permutation ================================================= + +// randperm implementation was copied from PyTorch to unblock CSPRNG users, but ultimately CSPRNG must reuse +// refactored randperm from PyTorch, see https://github.com/pytorch/pytorch/issues/43816 + +namespace { + + inline void 
check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) { + TORCH_CHECK(at::scalar_tensor(n, tensor.options()).defined(), + "n is too large for result tensor type: '", tensor.toString(), "'"); + + // Ensure sufficient precision for floating point representation. + switch (tensor.scalar_type()) { + case at::ScalarType::Half: + TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type."); + break; + case at::ScalarType::Float: + TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type."); + break; + case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to check + TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type."); + break; + default: + break; + } + } + + template + void randperm(Tensor& result, int64_t n, c10::optional generator) { + auto gen = at::check_generator(generator); + scalar_t *r__data = result.data_ptr(); + + result.resize_({n}); + int64_t r__stride_0 = result.stride(0); + + at::parallel_for(0, n, internal::GRAIN_SIZE, + [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) { + for(int64_t i = p_begin; i < p_end; i++) + r__data[i*r__stride_0] = static_cast(i); + }); + + for(int64_t i = 0; i < n - 1; i++) + { + int64_t z = gen->random() % (n-i); + scalar_t sav = r__data[i*r__stride_0]; + r__data[i*r__stride_0] = r__data[(z+i)*r__stride_0]; + r__data[(z+i)*r__stride_0] = sav; + } + } +} // namespace + +Tensor& randperm_generator_out(int64_t n, c10::optional generator, Tensor& result) { + TORCH_CHECK(n >= 0, "n must be non-negative, got", n); + check_supported_max_int_with_precision(n, result); + if (result.device().type() == at::kCUDA) { + auto result_cpu = at::empty({n}, result.options().device(kCPU)); + randperm_generator_out(n, generator, result_cpu); + result.resize_({n}); + return result.copy_(result_cpu); + } + result.resize_({n}); + // See Note [Acquire lock when using random generators] + std::lock_guard 
lock(generator->mutex()); + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, result.scalar_type(), "randperm", [&]() -> void { + randperm(result, n, generator); + }); + return result; +} + +// ================================================Encrypt/Decrypt===================================================== + +Tensor encrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) { + if (input.device().type() == DeviceType::CPU) { + return cpu::encrypt(input, output, key, cipher, mode); +#ifdef WITH_CUDA + } else if (input.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::encrypt(input, output, key, cipher, mode); +#endif + } else { + TORCH_CHECK(false, TENSOR_DEVICE_TYPE_IS_NOT_SUPPORTED); + } +} + +Tensor decrypt_pybind(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) { + if (input.device().type() == DeviceType::CPU) { + return cpu::decrypt(input, output, key, cipher, mode); +#ifdef WITH_CUDA + } else if (input.device().type() == DeviceType::CUDA) { + return torch::csprng::cuda::decrypt(input, output, key, cipher, mode); +#endif + } else { + TORCH_CHECK(false, TENSOR_DEVICE_TYPE_IS_NOT_SUPPORTED); + } +} + +// ==================================================================================================================== + +Generator create_random_device_generator(c10::optional token = c10::nullopt) { + if (token.has_value()) { + return make_generator(*token); + } else { + return make_generator(true); + } +} + +Generator create_mt19937_generator(c10::optional seed = c10::nullopt) { + if (seed.has_value()) { + return make_generator(*seed); + } else { + return make_generator(false); + } +} + +bool supports_cuda() { +#ifdef WITH_CUDA + return true; +#else + return false; +#endif +} + +TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) { + // Random + m.impl("random_.from", random_from_to); + m.impl("random_.to", random_to); + m.impl("random_", random_); + // 
Uniform + m.impl("uniform_", uniform_); + // Normal + m.impl("normal_", normal_); + m.impl("normal.Tensor_float_out", normal_Tensor_float_out); + m.impl("normal.float_Tensor_out", normal_float_Tensor_out); + m.impl("normal.Tensor_Tensor_out", normal_Tensor_Tensor_out); + m.impl("normal.Tensor_float", normal_Tensor_float); + m.impl("normal.float_Tensor", normal_float_Tensor); + m.impl("normal.Tensor_Tensor", normal_Tensor_Tensor); + // Cauchy + m.impl("cauchy_", cauchy_); + // LogNormal + m.impl("log_normal_", log_normal_); + // Geometric + m.impl("geometric_", geometric_); + // Exponential + m.impl("exponential_", exponential_); + // Random permutation + m.impl("randperm.generator_out", randperm_generator_out); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("supports_cuda", &supports_cuda); + m.def("create_random_device_generator", &create_random_device_generator, py::arg("token") = nullptr); + m.def("create_mt19937_generator", &create_mt19937_generator, py::arg("seed") = nullptr); + m.def("encrypt", &encrypt_pybind); + m.def("decrypt", &decrypt_pybind); +} diff --git a/torchcsprng/csrc/cuda/kernels.cu b/torchcsprng/csrc/cuda/kernels.cu new file mode 100644 index 0000000..6842ffb --- /dev/null +++ b/torchcsprng/csrc/cuda/kernels.cu @@ -0,0 +1,16 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "../kernels_commons.h" + +namespace torch { +namespace csprng { +namespace cuda { + +#include "../kernels_body.inc" + +}}} diff --git a/torchcsprng/csrc/cuda/kernels.cuh b/torchcsprng/csrc/cuda/kernels.cuh new file mode 100644 index 0000000..b2a05d4 --- /dev/null +++ b/torchcsprng/csrc/cuda/kernels.cuh @@ -0,0 +1,19 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace torch { +namespace csprng { +namespace cuda { + +#include "../kernels_decls.inc" + +}}} diff --git a/torchcsprng/csrc/kernels_body.inc b/torchcsprng/csrc/kernels_body.inc new file mode 100644 index 0000000..a2be40d --- /dev/null +++ b/torchcsprng/csrc/kernels_body.inc @@ -0,0 +1,437 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "aes.inc" + +// Generates `block_t_size`-bytes random key Tensor on CPU +// using `generator`, which must be an instance of `at::CPUGeneratorImpl` +// and passes it to the `device`. +template +at::Tensor key_tensor(size_t block_t_size, c10::optional generator) { + std::lock_guard lock(generator->mutex()); + auto gen = at::check_generator(generator); + auto key = torch::empty({static_cast(block_t_size)}, torch::kUInt8); + using random_t = typename std::result_of::type; + constexpr size_t random_t_size = sizeof(random_t); + for (size_t i = 0; i < block_t_size / random_t_size; i++) { + const auto rand = gen->random(); + for (size_t j = 0; j < random_t_size; j++) { + size_t k = i * random_t_size + j; + key[k] = static_cast((rand >> (j * 8)) & 0xff); + } + } + return key; +} + +template +at::Tensor aes128_key_tensor(at::Generator generator) { + return key_tensor(aes::block_t_size, generator); +} + +// ==================================================================================================================== + +// A simple container for random state sub-blocks that implements RNG interface +// with random() and random64() methods, that are used by transformation function +template +struct RNGValues { + TORCH_CSPRNG_HOST_DEVICE RNGValues(uint64_t* vals) { + 
memcpy(&vals_, vals, size * sizeof(uint64_t)); + } + uint32_t TORCH_CSPRNG_HOST_DEVICE random() { auto res = static_cast(vals_[index]); index++; return res; } + uint64_t TORCH_CSPRNG_HOST_DEVICE random64() { auto res = vals_[index]; index++; return res; } +private: + uint64_t vals_[size]; + int index = 0; +}; + +// Applies AES in CTR mode with the `key` for passed TensorIterator iter. +// `scalar_t` is a scalar type equivalent of target tensor dtype +// `uint_t` is an unsigned integral type of sub-blocks that random state is divided to +// (e.g, 16 bytes random state block can be divided into 16 uint8_t sub-blocks +// or 8 uint16_t sub-block or 4 uint32_t sub-block or 2 uint64_t sub-blocks) +// `N` is a number of sub-block which is used by `transform_func` +// to generate a random value of specific distribution (e.g. `normal` uses 2) +// `key` is a CUDA pointer to random key memory block +// `transform_func` is a callable that converts N `uint_t` random state sub-blocks passed in RNGValues into target dtype `scalar_t` +template +void aes_helper(at::TensorIterator& iter, const uint8_t* key_bytes, transform_t transform_func) { + auto output = iter.tensor(0); + const auto output_offset_calc = make_offset_calculator<1>(at::TensorIterator::nullary_op(output)); + const auto output_index_calc = [output_offset_calc] TORCH_CSPRNG_HOST_DEVICE (uint32_t li) -> uint32_t { + return output_offset_calc.get(li)[0]; + }; + torch::csprng::block_cipher( + nullptr, 0, 0, output_index_calc, + output.data_ptr(), output.numel(), output.element_size(), output_index_calc, + iter.device_type(), + [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void { + uint8_t idx_block[aes::block_t_size]; + std::memset(&idx_block, 0, aes::block_t_size); + *(reinterpret_cast(idx_block)) = idx; + aes::encrypt(idx_block, key_bytes); + for (size_t i = 0; i < aes::block_t_size; i++) { + block[i] ^= idx_block[i]; + } + }, + aes::block_t_size / (N * sizeof(uint_t)), + [transform_func] 
TORCH_CSPRNG_HOST_DEVICE (uint8_t* block) { + const auto n = aes::block_t_size / (N * sizeof(uint_t)); + for (size_t i = 0; i < n; ++i) { + uint64_t vals[N]; + for (size_t j = 0; j < N; ++j) { + vals[j] = (reinterpret_cast(block))[N * i + j]; + } + RNGValues rng(vals); + reinterpret_cast(block)[i] = transform_func(&rng); + } + } + ); +} + +// ==================================================================================================================== + +// A mapping between scalar type and corresponding unsigned integer type of random state sub-block. +// uint64_t for double and long, uint32_t for the rest +template +struct UIntType {}; + +template <> struct UIntType { using type = uint64_t; }; +template <> struct UIntType { using type = uint32_t; }; +template <> struct UIntType { using type = uint16_t; }; +template <> struct UIntType { using type = uint16_t; }; +template <> struct UIntType { using type = uint64_t; }; +template <> struct UIntType { using type = uint32_t; }; +template <> struct UIntType { using type = uint32_t; }; +template <> struct UIntType { using type = uint32_t; }; +template <> struct UIntType { using type = uint32_t; }; +template <> struct UIntType { using type = uint32_t; }; + +// ==================================================== Random ======================================================== + +template +struct RandomKernel { + void operator()(TensorIterator& iter, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "random_kernel", [&] { + aes_helper::type>(iter, key, + [] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { + uniform_int_distribution random; + return random(generator); + } + ); + }); + } +}; + +template +void random_from_to_kernel_helper(TensorIterator& iter, uint64_t range, int64_t base, const uint8_t* key) 
{ + aes_helper(iter, key, + [range, base] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { + uniform_int_from_to_distribution random(range, base); + return random(generator); + } + ); +} + +template +void random_full_range_kernel_helper(TensorIterator& iter, const uint8_t* key) { + aes_helper(iter, key, + [] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { + uniform_int_full_range_distribution random; + return random(generator); + } + ); +} + +template +struct RandomFromToKernel { + void operator()(TensorIterator& iter, uint64_t range, int64_t base, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "random_from_to_kernel", [&] { + if (( + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value)/* TODO: && range >= 1ULL << 32*/) + { + random_from_to_kernel_helper(iter, range, base, key); + } else { + random_from_to_kernel_helper(iter, range, base, key); + } + }); + } + void operator()(TensorIterator& iter, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel", [&] { + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) + { + random_full_range_kernel_helper(iter, key); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16"); + } + }); + } +}; + +at::Tensor& random_(at::Tensor& self, c10::optional generator) { + return at::native::templates::random_impl(self, generator); +} + +at::Tensor& random_from_to(at::Tensor& self, int64_t from, c10::optional to, c10::optional 
generator) { + return at::native::templates::random_from_to_impl(self, from, to, generator); +} + +at::Tensor& random_to(at::Tensor& self, int64_t to, c10::optional generator) { + return random_from_to(self, 0, to, generator); +} + +// ==================================================== Uniform ======================================================= + +template +struct UniformKernel { + void operator()(TensorIterator& iter, double from, double to, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel", [&] { + aes_helper(iter, key, + [from, to] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* generator) -> scalar_t { + uniform_real_distribution uniform(from, to); + return static_cast(uniform(generator)); + } + ); + }); + } +}; + +at::Tensor& uniform_(at::Tensor& self, double from, double to, c10::optional generator) { + return at::native::templates::uniform_impl_(self, from, to, generator); +} + +// ==================================================== Normal ======================================================== + +template +struct NormalKernel { + void operator()(Tensor& self, double mean, double std, c10::optional generator) { + auto iter = TensorIterator::nullary_op(self); + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel", [&] { + aes_helper(iter, key, + [mean, std] TORCH_CSPRNG_HOST_DEVICE (RNGValues<2>* gen) -> scalar_t { + normal_distribution normal(mean, std); + return static_cast(normal(gen)); + } + ); + }); + } +}; + +at::Tensor& normal_(at::Tensor& self, double mean, double std, c10::optional generator) { + return at::native::templates::normal_impl_(self, mean, std, generator); +} + +at::Tensor& 
normal_Tensor_float_out(at::Tensor& output, const at::Tensor& mean, double std, c10::optional gen) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +at::Tensor& normal_float_Tensor_out(at::Tensor& output, double mean, const at::Tensor& std, c10::optional gen) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +at::Tensor& normal_Tensor_Tensor_out(at::Tensor& output, const at::Tensor& mean, const at::Tensor& std, c10::optional gen) { + return at::native::templates::normal_out_impl(output, mean, std, gen); +} + +at::Tensor normal_Tensor_float(const at::Tensor& mean, double std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + +at::Tensor normal_float_Tensor(double mean, const at::Tensor& std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + +at::Tensor normal_Tensor_Tensor(const at::Tensor& mean, const at::Tensor& std, c10::optional gen) { + return at::native::templates::normal_impl(mean, std, gen); +} + +// ==================================================== Cauchy ======================================================== + +template +struct CauchyKernel { + void operator()(TensorIterator& iter, double median, double sigma, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_kernel", [&] { + aes_helper(iter, key, + [median, sigma] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* gen) -> scalar_t { + cauchy_distribution cauchy(median, sigma); + return static_cast(cauchy(gen)); + } + ); + }); + } +}; + +at::Tensor& cauchy_(at::Tensor& self, double median, double sigma, c10::optional generator) { + return at::native::templates::cauchy_impl_(self, median, sigma, generator); +} + +// ================================================== LogNormal 
======================================================= + +template +struct LogNormalKernel { + void operator()(TensorIterator& iter, double mean, double std, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal", [&] { + aes_helper(iter, key, + [mean, std] TORCH_CSPRNG_HOST_DEVICE (RNGValues<2>* gen) -> scalar_t { + lognormal_distribution logNormal(mean, std); + return static_cast(logNormal(gen)); + } + ); + }); + } +}; + +at::Tensor& log_normal_(at::Tensor& self, double mean, double std, c10::optional gen) { + return at::native::templates::log_normal_impl_(self, mean, std, gen); +} + +// ================================================== Geometric ======================================================= + +template +struct GeometricKernel { + void operator()(TensorIterator& iter, double p, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_kernel", [&] { + aes_helper::type, 1>(iter, key, + [p] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* gen) -> scalar_t { + geometric_distribution geometric(p); + return geometric(gen); + } + ); + }); + } +}; + +at::Tensor& geometric_(at::Tensor& self, double p, c10::optional gen) { + return at::native::templates::geometric_impl_(self, p, gen); +} + +// ================================================== Exponential ===================================================== + +template +struct ExponentialKernel { + void operator()(TensorIterator& iter, double lambda, c10::optional generator) { + const Tensor key_t = aes128_key_tensor(*generator).to(iter.device()); + const auto key = key_t.data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, 
at::ScalarType::BFloat16, iter.dtype(), "exponential_kernel", [&] { + aes_helper(iter, key, + [lambda] TORCH_CSPRNG_HOST_DEVICE (RNGValues<1>* gen) -> scalar_t { + exponential_distribution exponential(lambda); + return static_cast(exponential(gen)); + } + ); + }); + } +}; + +at::Tensor& exponential_(at::Tensor& self, double lambda, c10::optional gen) { + return at::native::templates::exponential_impl_(self, lambda, gen); +} + +// ================================================Encrypt/Decrypt===================================================== + +void check_cipher(const std::string& cipher, Tensor key) { + if (cipher == "aes128") { + TORCH_CHECK(key.element_size() * key.numel() == 16, "key tensor must have 16 bytes(128 bits)"); + } else { + TORCH_CHECK(false, "encrypt/decrypt supports \"aes128\" cipher, \"", cipher, "\" is not supported."); + } +} + +void aes_ecb_encrypt(Tensor input, Tensor output, uint8_t* key_bytes) { + block_cipher(input, output, + [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void { + aes::encrypt(block, key_bytes); + } + ); +} + +void aes_ecb_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) { + block_cipher(input, output, + [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void { + aes::decrypt(block, key_bytes); + } + ); +} + +void aes_ctr_encrypt(Tensor input, Tensor output, uint8_t* key_bytes) { + block_cipher(input, output, + [key_bytes] TORCH_CSPRNG_HOST_DEVICE (int64_t idx, uint8_t* block) -> void { + uint8_t idx_block[aes::block_t_size]; + std::memset(&idx_block, 0, aes::block_t_size); + *(reinterpret_cast(idx_block)) = idx; + aes::encrypt(idx_block, key_bytes); + for (size_t i = 0; i < aes::block_t_size; i++) { + block[i] ^= idx_block[i]; + } + } + ); +} + +void aes_ctr_decrypt(Tensor input, Tensor output, uint8_t* key_bytes) { + aes_ctr_encrypt(input, output, key_bytes); +} + +Tensor encrypt(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& 
mode) { + TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device"); + const auto output_size_bytes = output.numel() * output.itemsize(); + const auto input_size_bytes = input.numel() * input.itemsize(); + const auto input_size_bytes_rounded = (input_size_bytes + aes::block_t_size - 1) / aes::block_t_size * aes::block_t_size; + TORCH_CHECK(output_size_bytes == input_size_bytes_rounded, + "output size in bytes(", output_size_bytes, + ") is not equal to input size in bytes rounded to block size(", + input_size_bytes_rounded, ")"); + check_cipher(cipher, key); + const auto key_bytes = reinterpret_cast(key.contiguous().data_ptr()); + if (mode == "ecb") { + aes_ecb_encrypt(input, output, key_bytes); + } else if (mode == "ctr") { + aes_ctr_encrypt(input, output, key_bytes); + } else { + TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported."); + } + return output; +} + +Tensor decrypt(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode) { + TORCH_CHECK(input.device() == output.device() && input.device() == key.device(), "input, output and key tensors must have the same device"); + const auto output_size_bytes = output.numel() * output.itemsize(); + const auto input_size_bytes = input.numel() * input.itemsize(); + const auto diff = input_size_bytes - output_size_bytes; + TORCH_CHECK(0 <= diff && diff < aes::block_t_size, "output tensor size in bytes must be less then or equal to input tensor size in bytes, the difference must be less than block size"); + TORCH_CHECK(input_size_bytes % aes::block_t_size == 0, "input tensor size in bytes must divisible by cipher block size in bytes"); + check_cipher(cipher, key); + const auto key_bytes = reinterpret_cast(key.contiguous().data_ptr()); + if (mode == "ecb") { + aes_ecb_decrypt(input, output, key_bytes); + } else if (mode == "ctr") { + aes_ctr_decrypt(input, 
output, key_bytes); + } else { + TORCH_CHECK(false, "encrypt/decrypt supports \"ecb\" and \"ctr\" modes, \"", mode, "\" is not supported."); + } + return output; +} diff --git a/torchcsprng/csrc/kernels_commons.h b/torchcsprng/csrc/kernels_commons.h new file mode 100644 index 0000000..f4021a7 --- /dev/null +++ b/torchcsprng/csrc/kernels_commons.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "macros.h" +#include "block_cipher.h" + +inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) { + return (static_cast(hi) << 32) | lo; +} + +// CUDA CSPRNG is actually CPU generator which is used only to generate a random key on CPU for AES running in a block mode on CUDA +struct CSPRNGGeneratorImpl : public c10::GeneratorImpl { + CSPRNGGeneratorImpl(bool use_rd) : c10::GeneratorImpl{at::Device(at::DeviceType::CPU), at::DispatchKeySet(at::DispatchKey::CustomRNGKeyId)}, use_rd_{use_rd} {} + CSPRNGGeneratorImpl(const std::string& token) : c10::GeneratorImpl{at::Device(at::DeviceType::CPU), at::DispatchKeySet(at::DispatchKey::CustomRNGKeyId)}, use_rd_{true}, rd_{token} {} + CSPRNGGeneratorImpl(uint64_t seed) : c10::GeneratorImpl{at::Device(at::DeviceType::CPU), at::DispatchKeySet(at::DispatchKey::CustomRNGKeyId)}, use_rd_{false}, mt_{static_cast(seed)} { } + ~CSPRNGGeneratorImpl() = default; + uint32_t random() { return use_rd_ ? rd_() : mt_(); } + uint64_t random64() { return use_rd_ ? 
make64BitsFrom32Bits(rd_(), rd_()) : make64BitsFrom32Bits(mt_(), mt_()); } + + void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } + uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } + uint64_t seed() override { throw std::runtime_error("not implemented"); } + CSPRNGGeneratorImpl* clone_impl() const override { throw std::runtime_error("not implemented"); } + + static at::DeviceType device_type() { return at::DeviceType::CPU; } + + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr get_state() const override { throw std::runtime_error("not implemented"); } + + bool use_rd_; + std::random_device rd_; + std::mt19937 mt_; +}; diff --git a/torchcsprng/csrc/kernels_decls.inc b/torchcsprng/csrc/kernels_decls.inc new file mode 100644 index 0000000..d07aa09 --- /dev/null +++ b/torchcsprng/csrc/kernels_decls.inc @@ -0,0 +1,56 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// ==================================================== Random ======================================================== + +at::Tensor& random_(at::Tensor& self, c10::optional generator); + +at::Tensor& random_from_to(at::Tensor& self, int64_t from, optional to, c10::optional generator); + +at::Tensor& random_to(at::Tensor& self, int64_t to, c10::optional generator); + +// ==================================================== Uniform ======================================================= + +at::Tensor& uniform_(at::Tensor& self, double from, double to, c10::optional generator); + +// ==================================================== Normal ======================================================== + +at::Tensor& normal_(at::Tensor& self, double mean, double std, c10::optional generator); + +at::Tensor& normal_Tensor_float_out(at::Tensor& output, const at::Tensor& mean, double std, c10::optional gen); + +at::Tensor& normal_float_Tensor_out(at::Tensor& output, double mean, const at::Tensor& std, c10::optional gen); + +at::Tensor& normal_Tensor_Tensor_out(at::Tensor& output, const at::Tensor& mean, const at::Tensor& std, c10::optional gen); + +at::Tensor normal_Tensor_float(const at::Tensor& mean, double std, c10::optional gen); + +at::Tensor normal_float_Tensor(double mean, const at::Tensor& std, c10::optional gen); + +at::Tensor normal_Tensor_Tensor(const at::Tensor& mean, const at::Tensor& std, c10::optional gen); + +// ==================================================== Cauchy ======================================================== + +at::Tensor& cauchy_(at::Tensor& self, double median, double sigma, c10::optional generator); + +// ================================================== LogNormal ======================================================= + +at::Tensor& log_normal_(at::Tensor& self, double mean, double std, c10::optional gen); + +// ================================================== Geometric 
======================================================= + +at::Tensor& geometric_(at::Tensor& self, double p, c10::optional gen); + +// ================================================== Exponential ===================================================== + +at::Tensor& exponential_(at::Tensor& self, double lambda, c10::optional gen); + +// ================================================Encrypt/Decrypt===================================================== + +Tensor encrypt(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode); + +Tensor decrypt(Tensor input, Tensor output, Tensor key, const std::string& cipher, const std::string& mode); diff --git a/torch_csprng/csrc/macros.h b/torchcsprng/csrc/macros.h similarity index 51% rename from torch_csprng/csrc/macros.h rename to torchcsprng/csrc/macros.h index aaa6b67..d21b25c 100644 --- a/torch_csprng/csrc/macros.h +++ b/torchcsprng/csrc/macros.h @@ -1,3 +1,10 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. All Rights Reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + #pragma once #if defined(__CUDACC__) || defined(__HIPCC__) diff --git a/version.txt b/version.txt index 5d192a8..c181bf5 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.1.0a0 \ No newline at end of file +0.3.0a0