diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..5ace4600a1 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/labeler.yml b/.github/labeler.yml index 8d13d71446..839d7f36c0 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2023-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,5 @@ # limitations under the License. allocator: -- 'src/tbbmalloc/**/*' -- 'src/tbbmalloc_proxy/**/*' -- 'test/tbbmalloc/**/*' +- changed-files: + - any-glob-to-any-file: ['src/tbbmalloc/**/*', 'src/tbbmalloc_proxy/**/*', 'test/tbbmalloc/**/*'] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a65de62241..abda1e140f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -37,7 +37,7 @@ jobs: runs-on: [ubuntu-20.04] timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run scan run: | sudo apt update && sudo apt install -y codespell @@ -47,7 +47,7 @@ jobs: runs-on: [ubuntu-20.04] timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run scan run: | command -v clang-format-10 @@ -62,7 +62,7 @@ jobs: runs-on: [ubuntu-22.04] timeout-minutes: 10 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Install prerequisites run: | pip3 install -U Jinja2 @@ -75,7 +75,7 @@ jobs: export BUILD_TYPE=${BUILD_TYPE} && sphinx-build doc html tar -czvf html.tar.gz html/ - name: Save docs - uses: actions/upload-artifact@v2.2.1 + uses: actions/upload-artifact@v4 with: name: oneTBB-html-docs-${{ env.GITHUB_SHA_SHORT }} path: html.tar.gz @@ -90,14 +90,14 @@ jobs: needs: [documentation] 
steps: - name: Checkout gh-pages - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: gh-pages path: gh-pages - name: Set env run: echo GITHUB_SHA_SHORT=${GITHUB_SHA::8} >> $GITHUB_ENV - name: Download documetation - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: oneTBB-html-docs-${{ env.GITHUB_SHA_SHORT }} - name: Publish to github pages @@ -117,7 +117,7 @@ jobs: if: ${{ github.ref != 'refs/heads/master' }} runs-on: [ubuntu-20.04] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Run check @@ -137,7 +137,7 @@ jobs: runs-on: [ubuntu-latest] timeout-minutes: 15 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing run: | mkdir build && cd build @@ -179,7 +179,7 @@ jobs: preview: 'ON' cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -198,7 +198,7 @@ jobs: fail-fast: false matrix: include: - - os: macos-12 + - os: macos-14 c_compiler: clang cxx_compiler: clang++ std: 14 @@ -212,7 +212,7 @@ jobs: preview: 'ON' cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -257,7 +257,7 @@ jobs: preview: 'OFF' job_name: windows_cl2022_cxx17_relwithdebinfo_preview=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing run: | mkdir build @@ -295,7 +295,7 @@ jobs: build_type: debug preview: 'ON' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -314,14 +314,14 @@ jobs: fail-fast: false matrix: include: - - os: macos-12 + - os: macos-15 c_compiler: clang cxx_compiler: clang++ std: 14 build_type: relwithdebinfo preview: 'ON' steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing shell: bash run: | @@ -357,7 +357,7 @@ jobs: preview: 'OFF' 
job_name: examples_windows_cl2022_cxx17_relwithdebinfo_preview=OFF steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run testing run: | mkdir build diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7a80c5f0e2..28d3ff62b1 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,7 +46,7 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@v2.6.1 + uses: step-security/harden-runner@v2.10.2 with: egress-policy: audit diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml new file mode 100644 index 0000000000..b9a01bb069 --- /dev/null +++ b/.github/workflows/coverity.yml @@ -0,0 +1,61 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Coverity Scan + +on: + +# Only run on push to master branch + push: + branches: [master] + +permissions: read-all + +env: + BUILD_CONCURRENCY: 4 + COVERITY_PROJECT: oneapi-src%2FoneTBB + +jobs: + coverity_linux: + name: Coverity Linux + runs-on: [ubuntu-latest] + steps: + - uses: actions/checkout@v4 + - name: Download Linux 64 Coverity Tool + run: | + curl https://scan.coverity.com/download/cxx/linux64 --output ${GITHUB_WORKSPACE}/cov-linux64-tool.tar.gz \ + --data "token=${{secrets.COVERITY_TOKEN}}&project=${{env.COVERITY_PROJECT}}" + mkdir cov-linux64-tool + tar -xzf cov-linux64-tool.tar.gz --strip 1 -C cov-linux64-tool + - name: Build with cov-build + run: | + export PATH="${PWD}/cov-linux64-tool/bin:${PATH}" + mkdir build && cd build + cmake -DCMAKE_CXX_STANDARD=20 -DCMAKE_BUILD_TYPE=relwithdebinfo \ + -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DTBB_CPF=ON .. + cov-build --dir cov-int make VERBOSE=1 -j${{env.BUILD_CONCURRENCY}} + - name: Archive Coverity build results + run: | + cd build + tar -czvf cov-int.tgz cov-int + - name: Submit Coverity results for analysis + run: | + cd build + curl \ + --form token="${{ secrets.COVERITY_TOKEN }}" \ + --form email="${{ secrets.COVERITY_EMAIL }}" \ + --form file=@cov-int.tgz \ + --form version="${GITHUB_SHA}" \ + --form description="" \ + "https://scan.coverity.com/builds?project=${{env.COVERITY_PROJECT}}" diff --git a/.github/workflows/issue_labeler.yml b/.github/workflows/issue_labeler.yml index 80591aa974..1f8e9f78bc 100644 --- a/.github/workflows/issue_labeler.yml +++ b/.github/workflows/issue_labeler.yml @@ -29,9 +29,8 @@ jobs: issues: write contents: read steps: - - uses: github/issue-labeler@v3.2 #May not be the latest version + - uses: github/issue-labeler@v3.4 #May not be the latest version with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: .github/issue_labeler.yml enable-versioned-regex: 0 - sync-labels: 1 diff --git a/.github/workflows/labeler.yml 
b/.github/workflows/labeler.yml index 36812ebd62..770c211780 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -24,5 +24,7 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/labeler@v4 - + - uses: actions/checkout@v4 + - uses: actions/labeler@v5 + with: + configuration-path: .github/labeler.yml diff --git a/.github/workflows/ossf-scorecard.yml b/.github/workflows/ossf-scorecard.yml index 9f45569f8a..e03dbb750a 100644 --- a/.github/workflows/ossf-scorecard.yml +++ b/.github/workflows/ossf-scorecard.yml @@ -47,7 +47,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@v2.3.1 + uses: ossf/scorecard-action@v2.4.0 with: results_file: results.sarif results_format: sarif diff --git a/BUILD.bazel b/BUILD.bazel index 9073f4640d..7c479ca3ed 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -37,6 +37,7 @@ cc_library( ]), copts = ["-w"] + select({ "@platforms//os:windows": [""], + "@platforms//cpu:arm64": [""], "//conditions:default": ["-mwaitpkg"], }), defines = diff --git a/Bazel.md b/Bazel.md index 09a630a72b..13b112ee81 100644 --- a/Bazel.md +++ b/Bazel.md @@ -40,7 +40,7 @@ load("@platforms//tools/build_defs/repo:git.bzl", "git_repository") git_repository( name = "oneTBB", branch = "master", - remote = "https://github.com/oneapi-src/oneTBB/", + remote = "https://github.com/uxlfoundation/oneTBB/", ) ``` diff --git a/CMakeLists.txt b/CMakeLists.txt index 19232a9920..0416364300 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,12 @@ cmake_minimum_required(VERSION 3.5) # Enable CMake policies +if (POLICY CMP0063) + # The NEW behavior for this policy is to honor the visibility properties for all target types. 
+ cmake_policy(SET CMP0063 NEW) + set(CMAKE_POLICY_DEFAULT_CMP0063 NEW) +endif() + if (POLICY CMP0068) # RPATH settings do not affect install_name on macOS since CMake 3.9 cmake_policy(SET CMP0068 NEW) @@ -49,7 +55,7 @@ string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_min string(REGEX REPLACE ".*#define TBB_VERSION_PATCH ([0-9]+).*" "\\1" _tbb_ver_patch "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_version_info}") string(REGEX REPLACE ".*#define __TBB_BINARY_VERSION ([0-9]+).*" "\\1" TBB_BINARY_VERSION "${_tbb_version_info}") -set(TBB_BINARY_MINOR_VERSION ${_tbb_ver_minor}) +string(REGEX REPLACE "..(..)." "\\1" TBB_BINARY_MINOR_VERSION "${TBB_INTERFACE_VERSION}") set(TBBMALLOC_BINARY_VERSION 2) set(TBBBIND_BINARY_VERSION 3) @@ -84,6 +90,11 @@ endif() set(CMAKE_CXX_EXTENSIONS OFF) # use -std=c++... instead of -std=gnu++... # --------------------------------------------------------------------------------------------------------- +# Setup symbol visibility properties. + +set(CMAKE_VISIBILITY_INLINES_HIDDEN TRUE) +set(CMAKE_CXX_VISIBILITY_PRESET "hidden") +# --------------------------------------------------------------------------------------------------------- # Detect architecture (bitness). 
if (CMAKE_SIZEOF_VOID_P EQUAL 4) @@ -105,8 +116,13 @@ option(TBB_CPF "Enable preview features of the library" OFF) option(TBB_FIND_PACKAGE "Enable search for external oneTBB using find_package instead of build from sources" OFF) option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg-config tool" ${CMAKE_CROSSCOMPILING}) option(TBB_ENABLE_IPO "Enable Interprocedural Optimization (IPO) during the compilation" ON) +option(TBB_CONTROL_FLOW_GUARD "Enable Control Flow Guard (CFG) during the compilation" OFF) option(TBB_FUZZ_TESTING "Enable fuzz testing" OFF) option(TBB_INSTALL "Enable installation" ON) +option(TBB_FILE_TRIM "Enable __FILE__ trim" ON) +if(LINUX) +option(TBB_LINUX_SEPARATE_DBG "Enable separation of the debug symbols during the build" OFF) +endif() if(APPLE) option(TBB_BUILD_APPLE_FRAMEWORKS "Build as Apple Frameworks" OFF) endif() @@ -226,6 +242,16 @@ if (TBB_ENABLE_IPO AND BUILD_SHARED_LIBS AND NOT ANDROID_PLATFORM AND NOT TBB_SA endif() endif() +if (TBB_FILE_TRIM) + file(RELATIVE_PATH TBB_RELATIVE_BIN_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR}) + file(TO_NATIVE_PATH ${CMAKE_SOURCE_DIR} NATIVE_TBB_PROJECT_ROOT_DIR) + file(TO_NATIVE_PATH ${TBB_RELATIVE_BIN_PATH} NATIVE_TBB_RELATIVE_BIN_PATH) +endif () + +if (TBB_CONTROL_FLOW_GUARD) + message(STATUS "Control Flow Guard (CFG) enabled") +endif() + set(TBB_COMPILER_SETTINGS_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compilers/${CMAKE_CXX_COMPILER_ID}.cmake) if (EXISTS ${TBB_COMPILER_SETTINGS_FILE}) include(${TBB_COMPILER_SETTINGS_FILE}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b2b6a968cd..3414b9eaf6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,17 +19,17 @@ As an open source project, we welcome community contributions to oneAPI Threadin ## Licensing -Licensing is very important to open source projects. It helps ensure the software continues to be available under the terms that the author desired. 
The oneTBB project uses the [Apache 2.0 License](https://github.com/oneapi-src/oneTBB/blob/master/LICENSE.txt), a permissive open source license that allows you to freely use, modify, and distribute your own products that include Apache 2.0 licensed software. By contributing to the oneTBB project, you agree to the license and copyright terms therein and release your own contributions under these terms. +Licensing is very important to open source projects. It helps ensure the software continues to be available under the terms that the author desired. The oneTBB project uses the [Apache 2.0 License](https://github.com/uxlfoundation/oneTBB/blob/master/LICENSE.txt), a permissive open source license that allows you to freely use, modify, and distribute your own products that include Apache 2.0 licensed software. By contributing to the oneTBB project, you agree to the license and copyright terms therein and release your own contributions under these terms. -Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/oneapi-src/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues. +Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/uxlfoundation/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues. ## Prerequisites -As a contributor, you’ll want to be familiar with the oneTBB project and the repository layout. 
You should also know how to use it as explained in the [oneTBB documentation](https://oneapi-src.github.io/oneTBB/) and how to set up your build development environment to configure, build, and test oneTBB as explained in the [oneTBB Build System Description](cmake/README.md). +As a contributor, you'll want to be familiar with the oneTBB project and the repository layout. You should also know how to use it as explained in the [oneTBB documentation](https://uxlfoundation.github.io/oneTBB/) and how to set up your build development environment to configure, build, and test oneTBB as explained in the [oneTBB Build System Description](cmake/README.md). ## Pull Requests -You can find all [open oneTBB pull requests](https://github.com/oneapi-src/oneTBB/pulls) on GitHub. +You can find all [open oneTBB pull requests](https://github.com/uxlfoundation/oneTBB/pulls) on GitHub. ### Before contributing changes directly to the oneTBB repository diff --git a/INSTALL.md b/INSTALL.md index 0ac95f8755..c33a2c7293 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -61,7 +61,7 @@ You can use the ``install`` components for partial installation. The following install components are supported: - `runtime` - oneTBB runtime package (core shared libraries and `.dll` files on Windows* OS). - `devel` - oneTBB development package (header files, CMake integration files, library symbolic links, and `.lib` files on Windows* OS). -- `tbb4py` - [oneTBB Module for Python](https://github.com/oneapi-src/oneTBB/blob/master/python/README.md). +- `tbb4py` - [oneTBB Module for Python](https://github.com/uxlfoundation/oneTBB/blob/master/python/README.md). 
If you want to install specific components after configuration and build, run: @@ -99,7 +99,7 @@ The following example demonstrates how to install oneTBB for single-configuratio # Do our experiments in /tmp cd /tmp # Clone oneTBB repository -git clone https://github.com/oneapi-src/oneTBB.git +git clone https://github.com/uxlfoundation/oneTBB.git cd oneTBB # Create binary directory for out-of-source build mkdir build && cd build @@ -121,7 +121,7 @@ Choose the configuration during the build and install steps: REM Do our experiments in %TMP% cd %TMP% REM Clone oneTBB repository -git clone https://github.com/oneapi-src/oneTBB.git +git clone https://github.com/uxlfoundation/oneTBB.git cd oneTBB REM Create binary directory for out-of-source build mkdir build && cd build diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 0000000000..99c0a1dc92 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,143 @@ + + +# Introduction + +This document defines roles in the oneTBB project. + +# Roles and Responsibilities + +oneTBB project defines three main roles: + * [Contributor](#contributor) + * [Code Owner](#code-Owner) + * [Maintainer](#maintainer) + +[permissions]: https://docs.github.com/en/organizations/managing-user-access-to-your-organizations-repositories/managing-repository-roles/repository-roles-for-an-organization#permissions-for-each-role + +| | Contributor | Code Owner | Maintainer | +| :------------------------------------------------------------------------------------------------------------------------------------------ | :---------------------: | :---------------------: | :---------------------: | +| _Responsibilities_ | | | | +| Follow the [Code of Conduct](./CODE_OF_CONDUCT.md) | ✓ | ✓ | ✓ | +| Follow [Contribution Guidelines](./CONTRIBUTING.md) | ✓ | ✓ | ✓ | +| Ensure [Contribution Guidelines](./CONTRIBUTING.md) are followed | ✗ | ✓ | ✓ | +| Co-own component or aspect of the library,
including contributing: bug fixes, implementing features,
and performance optimizations | ✗ | ✓ | ✓ | +| Co-own on technical direction of component or
aspect of the library including work on RFCs | ✗ | ✓ | ✓ | +| Co-own the project as a whole,
including determining strategy and policy for the project | ✗ | ✗ | ✓ | +| _Privileges_ | | | | +| Permission granted | [Read][permissions] | [Write][permissions] | [Maintain][permissions] | +| Eligible to become | Code Owner | Maintainer | ✗ | +| Can recommend Contributors
to become Code Owner | ✗ | ✓ | ✓ | +| Can participate in promotions of
Code Owners and Maintainers | ✗ | ✗ | ✓ | +| Can suggest Milestones during planning | ✓ | ✓ | ✓ | +| Can choose Milestones for specific component | ✗ | ✓ | ✓ | +| Make a decision on project's Milestones during planning | ✗ | ✗ | ✓ | +| Can propose new RFC or
participate in review of existing RFC | ✓ | ✓ | ✓ | +| Can request rework of RFCs
in represented area of responsibility | ✗ | ✓ | ✓ | +| Can request rework of RFCs
in any part of the project | ✗ | ✗ | ✓ | +| Can manage release process of the project | ✗ | ✗ | ✓ | +| Can represent the project in public as a Maintainer | ✗ | ✗ | ✓ | + +These roles are merit based. Refer to the corresponding section for specific +requirements and the nomination process. + +## Contributor + +A Contributor invests time and resources to improve oneTBB project. +Anyone can become a Contributor by bringing value in any following way: + * Answer questions from community members. + * Propose changes to the design. + * Provide feedback on design proposals. + * Review and/or test pull requests. + * Test releases and report bugs. + * Contribute code, including bug fixes, features implementations, +and performance optimizations. + +## Code Owner + +A Code Owner has responsibility for a specific project component or a functional +area. Code Owners are collectively responsible +for developing and maintaining their component or functional areas, including +reviewing all changes to corresponding areas of responsibility and indicating +whether those changes are ready to be merged. Code Owners have a track record of +contribution and review in the project. + +**Requirements:** + * Track record of accepted code contributions to a specific project component. + * Track record of contributions to the code review process. + * Demonstrate in-depth knowledge of the architecture of a specific project + component. + * Commit to being responsible for that specific area. + +How to become a Code Owner? +1. A Contributor is nominated by opening a PR modifying the MAINTAINERS.md file +including name, Github username, and affiliation. +2. At least two specific component Maintainers approve the PR. +3. [CODEOWNERS](./CODEOWNERS) file is updated to represent corresponding areas of responsibility. + +## Maintainer +Maintainers are the most established contributors responsible for the +project technical direction. 
They participate in making decisions about the +strategy and priorities of the project. + +**Requirements:** + * Have experience as a Code Owner. + * Track record of major project contributions to a specific project component. + * Demonstrate deep knowledge of a specific project component. + * Demonstrate broad knowledge of the project across multiple areas. + * Commit to using privileges responsibly for the good of the project. + * Be able to exercise judgment for the good of the project, independent of + their employer, friends, or team. + +Process of becoming a maintainer: +1. A Maintainer may nominate a current code owner to become a new Maintainer by +opening a PR against MAINTAINERS.md file. +2. A majority of the current Maintainers must then approve the PR. + +# Code Owners and Maintainers List + +## oneTBB core (API, Architecture, Tests) + +| Name | Github ID | Affiliation | Role | +| --------------------- | --------------------- | ----------------- | ---------- | +| Ilya Isaev | @isaevil | Intel Corporation | Code Owner | +| Sarath Nandu R | @sarathnandu | Intel Corporation | Code Owner | +| Dmitri Mokhov | @dnmokhov | Intel Corporation | Code Owner | +| Alexey Kukanov | @akukanov | Intel Corporation | Code Owner | +| Konstantin Boyarinov | @kboyarinov | Intel Corporation | Maintainer | +| Aleksei Fedotov | @aleksei-fedotov | Intel Corporation | Maintainer | +| Michael Voss | @vossmjp | Intel Corporation | Maintainer | +| Pavel Kumbrasev | @pavelkumbrasev | Intel Corporation | Maintainer | + +## oneTBB TBBMALLOC (API, Architecture, Tests) + +| Name | Github ID | Affiliation | Role | +| --------------------- | --------------------- | ----------------- | ---------- | +| Łukasz Plewa | @lplewa | Intel Corporation | Maintainer | + + +## oneTBB Documentation + +| Name | Github ID | Affiliation | Role | +| ---------------------- | --------------------- | ----------------- | ---------- | +| Alexandra Epanchinzeva | @aepanchi | Intel Corporation | Code Owner | + + 
+## oneTBB Release Management + +| Name | Github ID | Affiliation | Role | +| ------------------ | --------------------- | ----------------- | ---------- | +| Olga Malysheva | @omalyshe | Intel Corporation | Maintainer | + diff --git a/README.md b/README.md index 2e7c2e81ba..6f0b7d5922 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ # oneAPI Threading Building Blocks (oneTBB) -[![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE.txt) [![oneTBB CI](https://github.com/oneapi-src/oneTBB/actions/workflows/ci.yml/badge.svg)](https://github.com/oneapi-src/oneTBB/actions/workflows/ci.yml?query=branch%3Amaster) -[![Join the community on GitHub Discussions](https://badgen.net/badge/join%20the%20discussion/on%20github/blue?icon=github)](https://github.com/oneapi-src/oneTBB/discussions) +[![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE.txt) [![oneTBB CI](https://github.com/uxlfoundation/oneTBB/actions/workflows/ci.yml/badge.svg)](https://github.com/uxlfoundation/oneTBB/actions/workflows/ci.yml?query=branch%3Amaster) +[![Join the community on GitHub Discussions](https://badgen.net/badge/join%20the%20discussion/on%20github/blue?icon=github)](https://github.com/uxlfoundation/oneTBB/discussions) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9125/badge)](https://www.bestpractices.dev/projects/9125) +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/uxlfoundation/oneTBB/badge)](https://securityscorecards.dev/viewer/?uri=github.com/uxlfoundation/oneTBB) +[![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20oneTBB%20Guru-006BFF)](https://gurubase.io/g/onetbb) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/oneTBB/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/oneTBB) +[![Coverity Scan Build 
Status](https://img.shields.io/coverity/scan/30373.svg)](https://scan.coverity.com/projects/oneapi-src-onetbb) oneTBB is a flexible C++ library that simplifies the work of adding parallelism to complex applications, even if you are not a threading expert. @@ -31,12 +34,12 @@ See [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREME ## Documentation * [oneTBB Specification](https://spec.oneapi.com/versions/latest/elements/oneTBB/source/nested-index.html) -* [oneTBB Developer Guide and Reference](https://oneapi-src.github.io/oneTBB) -* [Migrating from TBB to oneTBB](https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Migration_Guide.html) +* [oneTBB Developer Guide and Reference](https://uxlfoundation.github.io/oneTBB) +* [Migrating from TBB to oneTBB](https://uxlfoundation.github.io/oneTBB/main/tbb_userguide/Migration_Guide.html) * [README for the CMake build system](cmake/README.md) -* [oneTBB Testing Approach](https://oneapi-src.github.io/oneTBB/main/intro/testing_approach.html) +* [oneTBB Testing Approach](https://uxlfoundation.github.io/oneTBB/main/intro/testing_approach.html) * [Basic support for the Bazel build system](Bazel.md) -* [oneTBB Discussions](https://github.com/oneapi-src/oneTBB/discussions) +* [oneTBB Discussions](https://github.com/uxlfoundation/oneTBB/discussions) * [WASM Support](WASM_Support.md) ## Installation @@ -63,8 +66,5 @@ Use GitHub Issues for feature requests, bug reports, and minor inquiries. For br oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE.txt). By its terms, contributions submitted to the project are also done under that license. -## Engineering team contacts -* [Email us.](mailto:inteltbbdevelopers@intel.com) - ------------------------------------------------------------------------ \* All names and brands may be claimed as the property of others. 
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index c9b8e97135..16f04e0b92 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -18,8 +18,13 @@ This document contains changes of oneTBB compared to the last release. ## Table of Contents +- [Preview Features](#preview-features) - [Known Limitations](#known-limitations) -- [Fixed Issues](#fixed-issues) +- [Issues Fixed](#issues-fixed) +- [Open-Source Contributions Integrated](#open-source-contributions-integrated) + +## :tada: Preview Features +- Extended the Flow Graph receiving nodes with a new ``try_put_and_wait`` API that submits a message to the graph and waits for its completion. ## :rotating_light: Known Limitations - The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using hwloc version lower than 2.5. @@ -28,15 +33,19 @@ This document contains changes of oneTBB compared to the last release. - On Windows OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by specifying /wd4324 to the compiler command line. - C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293. - When CPU resource coordination is enabled, tasks from a lower-priority ``task_arena`` might be executed before tasks from a higher-priority ``task_arena``. +- Using oneTBB on WASM*, may cause applications to run in a single thread. See [Limitations of WASM Support](https://github.com/uxlfoundation/oneTBB/blob/master/WASM_Support.md#limitations). 
+ +> **_NOTE:_** To see known limitations that impact all versions of oneTBB, refer to [oneTBB Documentation](https://uxlfoundation.github.io/oneTBB/main/intro/limitations.html). -> **_NOTE:_** To see known limitations that impact all versions of oneTBB, refer to [oneTBB Documentation](https://oneapi-src.github.io/oneTBB/main/intro/limitations.html). +## :hammer: Issues Fixed +- Fixed the missed signal for thread request for enqueue operation. +- Significantly improved scalability of ``task_group``, ``flow_graph``, and ``parallel_for_each``. +- Removed usage of ``std::aligned_storage`` deprecated in C++23 (Inspired by Valery Matskevich https://github.com/uxlfoundation/oneTBB/pull/1394). +- Fixed the issue where ``oneapi::tbb::info`` interfaces might interfere with the process affinity mask on the Windows* OS systems with multiple processor groups. -## :hammer: Fixed Issues -- Fixed ``parallel_for_each`` algorithm behavior for iterators defining ``iterator_concept`` trait instead of ``iterator_category``. -- Fixed the redefinition issue for ``std::min`` and ``std::max`` on Windows* OS ([GitHub* #832](https://github.com/oneapi-src/oneTBB/issues/832)). -- Fixed the incorrect binary search order in ``TBBConfig.cmake``. -- Enabled the oneTBB library search using the pkg-config tool in Conda packages. -## :octocat: Open-source Contributions Integrated -- Fixed the compiler warning for missing virtual destructor. Contributed by Elias Engelbert Plank (https://github.com/oneapi-src/oneTBB/pull/1215). +## :octocat: Open-Source Contributions Integrated +- Detect the GNU Binutils version to determine WAITPKG support better. Contributed by Martijn Courteaux (https://github.com/uxlfoundation/oneTBB/pull/1347). +- Fixed the build on non-English locales. Contributed by Vladislav Shchapov (https://github.com/uxlfoundation/oneTBB/pull/1450). +- Improved Bazel support. Contributed by Julian Amann (https://github.com/uxlfoundation/oneTBB/pull/1434). 
diff --git a/SECURITY.md b/SECURITY.md index 4926041fc2..c9be5beb7f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -61,6 +61,6 @@ If you have any suggestions on how this Policy could be improved, submit an issue or a pull request to this repository. **Do not** report potential vulnerabilities or security flaws via a pull request. -[1]: https://github.com/oneapi-src/oneTBB/releases/latest -[2]: https://github.com/oneapi-src/oneTBB/security/advisories/new -[3]: https://github.com/oneapi-src/oneTBB/security/advisories +[1]: https://github.com/uxlfoundation/oneTBB/releases/latest +[2]: https://github.com/uxlfoundation/oneTBB/security/advisories/new +[3]: https://github.com/uxlfoundation/oneTBB/security/advisories diff --git a/SUPPORT.md b/SUPPORT.md index 47bb60a538..bbf24f2854 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -21,14 +21,14 @@ Use the following methods if you face any challenges. ## Issues -If you have a problem, check out the [GitHub Issues](https://github.com/oneapi-src/oneTBB/issues) to see if the issue you want to address is already reported. +If you have a problem, check out the [GitHub Issues](https://github.com/uxlfoundation/oneTBB/issues) to see if the issue you want to address is already reported. You may find users that have encountered the same bug or have similar ideas for changes or updates. You can use issues to report a problem, make a feature request, or add comments on an existing issue. ## Discussions -Visit the [GitHub Discussions](https://github.com/oneapi-src/oneTBB/discussions) to engage with the community, ask questions, or help others. +Visit the [GitHub Discussions](https://github.com/uxlfoundation/oneTBB/discussions) to engage with the community, ask questions, or help others. ## Email diff --git a/WASM_Support.md b/WASM_Support.md index 6306620d7c..f40cf38c3d 100644 --- a/WASM_Support.md +++ b/WASM_Support.md @@ -47,7 +47,7 @@ Where: * ``-DCMAKE_C_COMPILER=emcc`` - specifies the C compiler as Emscripten* C compiler. 
-> **_NOTE:_** See [CMake documentation](https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md) to learn about other options. +> **_NOTE:_** See [CMake documentation](https://github.com/uxlfoundation/oneTBB/blob/master/cmake/README.md) to learn about other options. ## Run Test diff --git a/cmake/README.md b/cmake/README.md index 60df73c072..4e0d8d54aa 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -10,6 +10,7 @@ TBB_SANITIZE:STRING - Sanitizer parameter, passed to compiler/linker TBB_SIGNTOOL:FILEPATH - Tool for digital signing, used in post-install step for libraries if provided. TBB_SIGNTOOL_ARGS:STRING - Additional arguments for TBB_SIGNTOOL, used if TBB_SIGNTOOL is set. TBB_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) build (ON by default) +TBB_FIND_PACKAGE - Enable search for external oneTBB using find_package instead of build from sources (OFF by default) TBBMALLOC_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator build (ON by default) TBBMALLOC_PROXY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator proxy build (requires TBBMALLOC_BUILD. ON by default) TBB4PY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) Python module build (OFF by default) @@ -19,7 +20,10 @@ TBB_INSTALL_VARS:BOOL - Enable auto-generated vars installation(packages generat TBB_VALGRIND_MEMCHECK:BOOL - Enable scan for memory leaks using Valgrind (OFF by default) TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH - Disable HWLOC automatic search by pkg-config tool (OFF by default) TBB_ENABLE_IPO - Enable Interprocedural Optimization (IPO) during the compilation (ON by default) +TBB_CONTROL_FLOW_GUARD:BOOL - Enable Control Flow Guard (CFG) during the compilation (OFF by default) TBB_BUILD_APPLE_FRAMEWORKS - Enable the Apple* frameworks instead of dylibs, only available on the Apple platform. 
(OFF by default) +TBB_FILE_TRIM - Enable __FILE__ trim, replace a build-time full path with a relative path in the debug info and macro __FILE__; use it to make + reproducible location-independent builds (ON by default) ``` ## Configure, Build, and Test @@ -45,7 +49,7 @@ Some useful options: #### TBBBind Library Configuration -> **_TIP:_** It is recommended to install the HWLOC* library. See [oneTBB documentation](https://oneapi-src.github.io/oneTBB/GSG/next_steps.html#hybrid-cpu-and-numa-support) for details. +> **_TIP:_** It is recommended to install the HWLOC* library. See [oneTBB documentation](https://uxlfoundation.github.io/oneTBB/GSG/next_steps.html#hybrid-cpu-and-numa-support) for details. The TBBbind library has three versions: `tbbbind`, `tbbbind_2_0`, and `tbbbind_2_5`. Each of these versions is linked with the corresponding HWLOC* library version: - `tbbbind` links with `HWLOC 1.11.x` @@ -267,7 +271,7 @@ Variable | Description `TBB_VERSION` | oneTBB version (format: `...`) `TBB_IMPORTED_TARGETS` | All created oneTBB imported targets (not supported for builds from source code) -Starting from [oneTBB 2021.1](https://github.com/oneapi-src/oneTBB/releases/tag/v2021.1), GitHub* release TBBConfig files in the binary packages are located under `/lib/cmake/TBB`. +Starting from [oneTBB 2021.1](https://github.com/uxlfoundation/oneTBB/releases/tag/v2021.1), GitHub* release TBBConfig files in the binary packages are located under `/lib/cmake/TBB`. For example, `TBB_DIR` should be set to `/lib/cmake/TBB`. TBBConfig files are automatically created during the build from source code and can be installed together with the library. 
diff --git a/cmake/compilers/AppleClang.cmake b/cmake/compilers/AppleClang.cmake index 5ebbdbd1a6..9cf0b08628 100644 --- a/cmake/compilers/AppleClang.cmake +++ b/cmake/compilers/AppleClang.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -42,6 +42,10 @@ if ("${_tbb_target_architectures}" MATCHES "(x86_64|amd64|AMD64)") # OSX systems endif() unset(_tbb_target_architectures) +if (TBB_FILE_TRIM AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -ffile-prefix-map=${NATIVE_TBB_PROJECT_ROOT_DIR}/= -ffile-prefix-map=${NATIVE_TBB_RELATIVE_BIN_PATH}/=) +endif () + # TBB malloc settings set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions) diff --git a/cmake/compilers/Clang.cmake b/cmake/compilers/Clang.cmake index dcd66634f3..a0297faa82 100644 --- a/cmake/compilers/Clang.cmake +++ b/cmake/compilers/Clang.cmake @@ -80,6 +80,10 @@ if (MINGW) list(APPEND TBB_COMMON_COMPILE_FLAGS -U__STRICT_ANSI__) endif() +if (TBB_FILE_TRIM AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -ffile-prefix-map=${NATIVE_TBB_PROJECT_ROOT_DIR}/= -ffile-prefix-map=${NATIVE_TBB_RELATIVE_BIN_PATH}/=) +endif () + set(TBB_IPO_COMPILE_FLAGS $<$>:-flto>) set(TBB_IPO_LINK_FLAGS $<$>:-flto>) diff --git a/cmake/compilers/GNU.cmake b/cmake/compilers/GNU.cmake index 3b9af64263..da6b408af3 100644 --- a/cmake/compilers/GNU.cmake +++ b/cmake/compilers/GNU.cmake @@ -82,6 +82,9 @@ if (NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL Intel) # gcc 6.0 and later have -flifetime-dse option that controls elimination of stores done outside the object lifetime set(TBB_DSE_FLAG $<$>:-flifetime-dse=1>) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-fstack-clash-protection>) + + # Suppress 
GCC 12.x-14.x warning here that to_wait_node(n)->my_is_in_list might have size 0 + set(TBB_COMMON_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} $<$>,$>:-Wno-stringop-overflow>) endif() # Workaround for heavy tests and too many symbols in debug (rellocation truncated to fit: R_MIPS_CALL16) @@ -110,6 +113,10 @@ if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2> ) endif () +if (TBB_FILE_TRIM AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8) + set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -ffile-prefix-map=${NATIVE_TBB_PROJECT_ROOT_DIR}/= -ffile-prefix-map=${NATIVE_TBB_RELATIVE_BIN_PATH}/=) +endif () + # TBB malloc settings set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions) set(TBB_OPENMP_FLAG -fopenmp) diff --git a/cmake/compilers/IntelLLVM.cmake b/cmake/compilers/IntelLLVM.cmake index a9ebb3e670..b514378164 100644 --- a/cmake/compilers/IntelLLVM.cmake +++ b/cmake/compilers/IntelLLVM.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -20,6 +20,9 @@ if (WIN32) else() include(${CMAKE_CURRENT_LIST_DIR}/Clang.cmake) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) + # "--exclude-libs,ALL" is used to avoid accidental exporting of symbols + # from statically linked libraries + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel -Wl,--exclude-libs,ALL) set(TBB_OPENMP_FLAG -qopenmp) endif() set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS}) diff --git a/cmake/compilers/MSVC.cmake b/cmake/compilers/MSVC.cmake index 6568ec7eb8..b49e543776 100644 --- a/cmake/compilers/MSVC.cmake +++ b/cmake/compilers/MSVC.cmake @@ -77,6 +77,17 @@ if (TBB_WINDOWS_DRIVER) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /D _UNICODE /DUNICODE /DWINAPI_FAMILY=WINAPI_FAMILY_APP /D__WRL_NO_DEFAULT_LIB__) endif() +if (TBB_FILE_TRIM) + add_compile_options( + "$<$:/d1trimfile:${NATIVE_TBB_PROJECT_ROOT_DIR}\\>" + "$<$:/d1trimfile:${CMAKE_SOURCE_DIR}/>") +endif() + +if (TBB_CONTROL_FLOW_GUARD) + set(TBB_LIB_COMPILE_FLAGS ${TBB_LIB_COMPILE_FLAGS} /guard:cf) + set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /guard:cf) +endif() + if (CMAKE_CXX_COMPILER_ID MATCHES "(Clang|IntelLLVM)") if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86|AMD64|i.86)") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm -mwaitpkg) diff --git a/doc/GSG/get_started.rst b/doc/GSG/get_started.rst index d437ce89b8..2af04be6b0 100644 --- a/doc/GSG/get_started.rst +++ b/doc/GSG/get_started.rst @@ -8,11 +8,36 @@ It is helpful for new users of parallel programming and experienced developers t It is recommended for you to have a basic knowledge of C++ programming and some experience with parallel programming concepts. +|full_name| is a runtime-based parallel programming model for C++ code that uses tasks. +The template-based runtime library can help you harness the latent performance of multi-core processors. + +oneTBB enables you to simplify parallel programming by breaking computation into parallel running tasks. 
Within a single process, +parallelism is carried out by mapping tasks to threads. Threads are an operating system mechanism that allows the same or different sets of instructions +to be executed simultaneously. Using threads can make your program work faster and more efficiently. + +Here you can see one of the possible executions of tasks by threads. + +.. figure:: Images/how-oneTBB-works.png + :scale: 70% + :align: center + +Use oneTBB to write scalable applications that: + +* Specify logical parallel structure instead of threads. +* Emphasize data-parallel programming. +* Take advantage of concurrent collections and parallel algorithms. + +oneTBB supports nested parallelism and load balancing. It means that you can use the library without worrying about oversubscribing a system, which happens when more tasks are assigned to a system than it can handle efficiently. + +oneTBB is used in different areas, such as scientific simulations, gaming, data analysis, etc. + +It is available as a stand-alone product and as part of the |base_tk|. + To start using oneTBB, follow the next steps: ********************************************* -#. Learn what :ref:`oneTBB is` and see the :ref:`System Requirements`. +#. See the :ref:`System Requirements`. #. :ref:`Install oneTBB`. #. Run your program using oneTBB following the :ref:`Next Steps `. #. Learn how to :ref:`Integrate oneTBB into your project ` using CMake* and pkg-config tool. diff --git a/doc/GSG/installation.rst b/doc/GSG/installation.rst index e6b6a09c34..d8f1933265 100644 --- a/doc/GSG/installation.rst +++ b/doc/GSG/installation.rst @@ -3,5 +3,5 @@ Installation ============ -See the `installation instructions `_ +See the `installation instructions `_ that will help you to install |short_name| successfully. 
\ No newline at end of file diff --git a/doc/GSG/integrate.rst b/doc/GSG/integrate.rst index 13fd679dab..2b38dba246 100644 --- a/doc/GSG/integrate.rst +++ b/doc/GSG/integrate.rst @@ -26,10 +26,10 @@ Integrating oneTBB into your project using CMake*: To add oneTBB to another project using CMake*, add the following commands to your ``CMakeLists.txt`` file: -.. code-block:: +.. code-block:: cmake - `find_package(TBB REQUIRED)` - `target_link_libraries(my_executable TBB::tbb)` + find_package(TBB REQUIRED) + target_link_libraries(my_executable TBB::tbb) After that, configure your project with CMake* as usual. diff --git a/doc/GSG/intro.rst b/doc/GSG/intro.rst deleted file mode 100644 index da8c558d21..0000000000 --- a/doc/GSG/intro.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. _intro: - -What oneTBB Is -============== - -|full_name| is a runtime-based parallel programming model for C++ code that uses threads. -The template-based runtime library can help you harness the latent performance of multi-core processors. - -oneTBB enables you to simplify parallel programming by breaking computation into parallel running tasks. Within a single process, -parallelism is carried out through threads, an operating system mechanism that allows the same or different sets of instructions -to be executed simultaneously. Using threads can make your program work faster and more efficiently. - -Here you can see one of the possible executions of tasks by threads. - -.. figure:: Images/how-oneTBB-works.png - :scale: 70% - :align: center - -Use oneTBB to write scalable applications that: - -* Specify logical parallel structure instead of threads. -* Emphasize data-parallel programming. -* Take advantage of concurrent collections and parallel algorithms. - -oneTBB supports nested parallelism and load balancing. It means that you can use the library without worrying about oversubscribing a system, which happens when more tasks are assigned to a system than it can handle efficiently. 
- -oneTBB is used in different areas, such as scientific simulations, gaming, data analysis, etc. - -It is available as a stand-alone product and as part of the |base_tk|. diff --git a/doc/GSG/samples.rst b/doc/GSG/samples.rst index f19a256238..18bf812801 100644 --- a/doc/GSG/samples.rst +++ b/doc/GSG/samples.rst @@ -10,40 +10,40 @@ The following samples are available: * **Containers** - * `concurrent_hash_map `_ - * `concurrent_priority_queue `_ - -* `Flow Graph `_ - * `A solution to the binpacking problem using a queue_node, a buffer_node, and function_node. `_ - * `Cholesky Factorization algorithm `_ - * `An implementation of dining philosophers in graph using the reserving join_node `_ - * `A parallel implementation of bzip2 block-sorting file compressor `_ - * `An example of a collection of digital logic gates that can be easily composed into larger circuits `_ - * `An example of a Kohonen Self-Organizing Map using cancellation `_ + * `concurrent_hash_map `_ + * `concurrent_priority_queue `_ + +* `Flow Graph `_ + * `A solution to the binpacking problem using a queue_node, a buffer_node, and function_node. 
`_ + * `Cholesky Factorization algorithm `_ + * `An implementation of dining philosophers in graph using the reserving join_node `_ + * `A parallel implementation of bzip2 block-sorting file compressor `_ + * `An example of a collection of digital logic gates that can be easily composed into larger circuits `_ + * `An example of a Kohonen Self-Organizing Map using cancellation `_ * `Split computational kernel for execution between CPU and GPU `_ * **Algorithms** - * `parallel_for `_ - * `Game of life overlay `_ - * `Polygon overlay `_ - * `Parallel seismic wave simulation `_ - * `Parallel 2-D raytracer/renderer `_ - * `Find largest matching substrings `_ + * `parallel_for `_ + * `Game of life overlay `_ + * `Polygon overlay `_ + * `Parallel seismic wave simulation `_ + * `Parallel 2-D raytracer/renderer `_ + * `Find largest matching substrings `_ * `Resumable task: Split computational kernel for execution between CPU and GPU `_ - * `parallel_for_each `_ - * `parallel_pipeline `_ - * `parallel_reduce `_ + * `parallel_for_each `_ + * `parallel_pipeline `_ + * `parallel_reduce `_ * **Task Scheduler** - * `task_arena `_ - * `task_group `_ + * `task_arena `_ + * `task_group `_ * `Execute similar computational kernels, with one task executing the SYCL* code and the other task executing the oneTBB code `_ * **Other** - * `Compute Fibonacci numbers in different ways `_ + * `Compute Fibonacci numbers in different ways `_ .. note:: You can also refer to the `oneAPI Samples `_ to learn more about the ecosystem. \ No newline at end of file diff --git a/doc/GSG/system_requirements.rst b/doc/GSG/system_requirements.rst index d5e951f35a..593680147f 100644 --- a/doc/GSG/system_requirements.rst +++ b/doc/GSG/system_requirements.rst @@ -3,4 +3,4 @@ System Requirements ******************* -Refer to the `oneTBB System Requirements `_. \ No newline at end of file +Refer to the `oneTBB System Requirements `_. 
\ No newline at end of file diff --git a/doc/README.md b/doc/README.md index 0cdd56f9c1..0e17d2e2dd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -15,7 +15,7 @@ Do the following to generate HTML output of the documentation: 1. Clone oneTBB repository: ``` -git clone https://github.com/oneapi-src/oneTBB.git +git clone https://github.com/uxlfoundation/oneTBB.git ``` 2. Go to the `doc` folder: diff --git a/doc/conf.py b/doc/conf.py index 263b7c5c4c..a16dd5dec9 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -126,13 +126,13 @@ if BUILD_TYPE == 'dita': html_theme_options = { - 'repository_url': 'https://github.com/oneapi-src/oneTBB', + 'repository_url': 'https://github.com/uxlfoundation/oneTBB', 'path_to_docs': 'doc', 'repository_branch': 'master' } else: html_theme_options = { - 'repository_url': 'https://github.com/oneapi-src/oneTBB', + 'repository_url': 'https://github.com/uxlfoundation/oneTBB', 'path_to_docs': 'doc', 'use_issues_button': True, 'use_edit_page_button': True, @@ -140,9 +140,7 @@ } if BUILD_TYPE != 'oneapi' and BUILD_TYPE != 'dita': - html_theme_options = { - "extra_footer": "
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.

oneTBB is licensed under Apache License Version 2.0. Refer to the LICENSE file for the full license text and copyright notice.
" - } + html_theme_options["extra_footer"]="
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.

oneTBB is licensed under Apache License Version 2.0. Refer to the LICENSE file for the full license text and copyright notice.
" # Add any paths that contain custom static files (such as style sheets) here, @@ -159,11 +157,7 @@ else: html_js_files = ['custom.js'] -html_theme_options = { - "logo": { - "text": "oneTBB Documentation", - } -} +html_theme_options["logo"] = {"text": "oneTBB Documentation"} html_logo = '_static/oneAPI-rgb-rev-100.png' html_favicon = '_static/favicons.png' @@ -304,7 +298,7 @@ # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} # -- Options for todo extension ---------------------------------------------- diff --git a/doc/index/toctree.rst b/doc/index/toctree.rst index fba9aee46c..542a4bb601 100644 --- a/doc/index/toctree.rst +++ b/doc/index/toctree.rst @@ -17,7 +17,6 @@ :maxdepth: 2 /GSG/get_started - /GSG/intro /GSG/system_requirements /GSG/installation /GSG/next_steps diff --git a/doc/main/intro/help_support.rst b/doc/main/intro/help_support.rst index 278083d658..a1ab4097bc 100644 --- a/doc/main/intro/help_support.rst +++ b/doc/main/intro/help_support.rst @@ -12,4 +12,4 @@ Getting Help and Support For general information about oneTBB technical support, product updates, user forums, FAQs, tips and tricks and other support - questions, go to `GitHub issues `_. + questions, go to `GitHub issues `_. 
diff --git a/doc/main/reference/reference.rst b/doc/main/reference/reference.rst index c8ba0af944..4c293c02c7 100644 --- a/doc/main/reference/reference.rst +++ b/doc/main/reference/reference.rst @@ -50,3 +50,4 @@ The key properties of a preview feature are: concurrent_lru_cache_cls task_group_extensions custom_mutex_chmap + try_put_and_wait diff --git a/doc/main/reference/rvalue_reduce.rst b/doc/main/reference/rvalue_reduce.rst index 69d480d465..7cf66d86b3 100644 --- a/doc/main/reference/rvalue_reduce.rst +++ b/doc/main/reference/rvalue_reduce.rst @@ -33,7 +33,8 @@ or .. cpp:function:: Value Func::operator()(const Range& range, const Value& x) const - Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements _`. + Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the + `Range requirements `_. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. @@ -55,6 +56,7 @@ Example ******* .. code:: cpp + // C++17 #include #include diff --git a/doc/main/reference/try_put_and_wait.rst b/doc/main/reference/try_put_and_wait.rst new file mode 100644 index 0000000000..4e05961f39 --- /dev/null +++ b/doc/main/reference/try_put_and_wait.rst @@ -0,0 +1,324 @@ +.. _try_put_and_wait: + +Waiting for Single Messages in Flow Graph +========================================= + +.. contents:: + :local: + :depth: 1 + +Description +*********** + +This feature adds a new ``try_put_and_wait`` interface to the receiving nodes in the Flow Graph. +This function puts a message as an input into a Flow Graph and waits until all work related to +that message is complete. 
+``try_put_and_wait`` may reduce latency compared to calling ``graph::wait_for_all`` since +``graph::wait_for_all`` waits for all work, including work that is unrelated to the input message, to complete. + +``node.try_put_and_wait(msg)`` performs ``node.try_put(msg)`` on the node and waits until the work on ``msg`` is completed. +Therefore, the following conditions are true: + +* Any task initiated by any node in the Flow Graph that involves working with ``msg`` or any other intermediate result + computed from ``msg`` is completed. +* No intermediate results computed from ``msg`` remain in any buffers in the graph. + +.. caution:: + + To prevent ``try_put_and_wait`` calls from infinite waiting, avoid using buffering nodes at the end of the Flow Graph since the final result + will not be automatically consumed by the Flow Graph. + +.. caution:: + + The ``multifunction_node`` and ``async_node`` classes are not currently supported by this feature. Including one of these nodes in the + Flow Graph may cause ``try_put_and_wait`` to exit early, even if the computations on the initial input message are + still in progress. + +API +*** + +Header +------ + +.. code:: cpp + + #define TBB_PREVIEW_FLOW_GRAPH_FEATURES // macro option 1 + #define TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT // macro option 2 + #include + +Synopsis +-------- + +.. 
code:: cpp + + namespace oneapi { + namespace tbb { + template + class continue_node { + public: + bool try_put_and_wait(const continue_msg& input); + }; // class continue_node + + template + class function_node { + public: + bool try_put_and_wait(const Input& input); + }; // class function_node + + template + class overwrite_node { + public: + bool try_put_and_wait(const T& input); + }; // class overwrite_node + + template + class write_once_node { + public: + bool try_put_and_wait(const T& input); + }; // class write_once_node + + template + class buffer_node { + public: + bool try_put_and_wait(const T& input); + }; // class buffer_node + + template + class queue_node { + public: + bool try_put_and_wait(const T& input); + }; // class queue_node + + template > + class priority_queue_node { + public: + bool try_put_and_wait(const T& input); + }; // class priority_queue_node + + template + class sequencer_node { + public: + bool try_put_and_wait(const T& input); + }; // class sequencer_node + + template + class limiter_node { + public: + bool try_put_and_wait(const T& input); + }; // class limiter_node + + template + class broadcast_node { + public: + bool try_put_and_wait(const T& input); + }; // class broadcast_node + + template + class split_node { + public: + bool try_put_and_wait(const TupleType& input); + }; // class split_node + } // namespace tbb + } // namespace oneapi + +Member Functions +---------------- + +.. code:: cpp + + template + bool continue_node::try_put_and_wait(const continue_msg& input) + +**Effects**: Increments the count of input signals received. If the incremented count is equal to the number +of known predecessors, performs the ``body`` function object execution. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. 
code:: cpp + + template + bool function_node::try_put_and_wait(const Input& input) + +**Effects**: If the concurrency limit allows, executes the user-provided body on the incoming message ``input``. +Otherwise, depending on the ``Policy`` of the node, either queues the incoming message ``input`` or rejects it. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` if the input is accepted, ``false`` otherwise. + +.. code:: cpp + + template + bool overwrite_node::try_put_and_wait(const T& input) + +**Effects**: Stores ``input`` in the internal single-item buffer and broadcasts it to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. caution:: + + Since the input element is not retrieved from ``overwrite_node`` once accepted by the successor, + retrieve it by explicitly calling the ``clear()`` method or by overwriting with another element to prevent + ``try_put_and_wait`` from indefinite waiting. + +.. code:: cpp + + template + bool write_once_node::try_put_and_wait(const T& input) + +**Effects**: Stores ``input`` in the internal single-item buffer if it does not contain a valid value already. +If a new value is set, the node broadcasts it to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` for the first time after construction or a call to ``clear()``. + +.. 
caution:: + + Since the input element is not retrieved from the ``write_once_node`` once accepted by the successor, + retrieve it by explicitly calling the ``clear()`` method to prevent ``try_put_and_wait`` from indefinite waiting. + +.. code:: cpp + + template + bool buffer_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the set of items managed by the node and tries forwarding it to a successor. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool queue_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the set of items managed by the node and tries forwarding the least recently added item +to a successor. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool priority_queue_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the ``priority_queue_node`` and attempts to forward the item with the highest +priority among all items added to the node but not yet forwarded to the successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool sequencer_node::try_put_and_wait(const T& input) + +**Effects**: Adds ``input`` to the ``sequencer_node`` and tries forwarding the next item in sequence to a successor. 
+ +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +.. code:: cpp + + template + bool limiter_node::try_put_and_wait(const T& input) + +**Effects**: If the broadcast count is below the threshold, broadcasts ``input`` to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` if ``input`` is broadcasted; ``false`` otherwise. + +.. code:: cpp + + template + bool broadcast_node::try_put_and_wait(const T& input) + +**Effects**: Broadcasts ``input`` to all successors. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true`` even if the node cannot successfully forward the message to any of its successors. + +.. code:: cpp + + template + bool split_node::try_put_and_wait(const TupleType& input); + +**Effects**: Broadcasts each element in the incoming tuple to the nodes connected to the ``split_node`` output ports. +The element at index ``i`` of ``input`` is broadcasted through the output port number ``i``. + +Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and +related to ``input`` are executed, and no related objects remain in any buffer within the graph. + +**Returns**: ``true``. + +Example +******* + +.. 
code:: cpp + + #define TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + #include + #include + + struct f1_body; + struct f2_body; + struct f3_body; + struct f4_body; + + int main() { + using namespace oneapi::tbb; + + flow::graph g; + flow::broadcast_node start_node(g); + + flow::function_node f1(g, flow::unlimited, f1_body{}); + flow::function_node f2(g, flow::unlimited, f2_body{}); + flow::function_node f3(g, flow::unlimited, f3_body{}); + + flow::join_node> join(g); + + flow::function_node, int> f4(g, flow::serial, f4_body{}); + + flow::make_edge(start_node, f1); + flow::make_edge(f1, f2); + + flow::make_edge(start_node, f3); + + flow::make_edge(f2, flow::input_port<0>(join)); + flow::make_edge(f3, flow::input_port<1>(join)); + + flow::make_edge(join, f4); + + // Submit work into the graph + parallel_for(0, 100, [](int input) { + start_node.try_put_and_wait(input); + + // Post processing the result of input + }); + } + +Each iteration of ``parallel_for`` submits an input into the Flow Graph. After returning from ``try_put_and_wait(input)``, it is +guaranteed that all of the work related to the completion of ``input`` is done by all of the nodes in the graph. Tasks related to inputs +submitted by other calls are not guaranteed to be completed. diff --git a/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst b/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst index 3352dd8d32..99446ab659 100644 --- a/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst +++ b/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst @@ -72,4 +72,40 @@ along its longest axis. When used with ``parallel_for``, it causes the loop to be "recursively blocked" in a way that improves cache usage. This nice cache behavior means that using ``parallel_for`` over a ``blocked_range2d`` can make a loop run faster than the sequential -equivalent, even on a single processor. +equivalent, even on a single processor. 
+ +The ``blocked_range2d`` allows you to use different value types for +its first dimension, *rows*, and the second one, *columns*. +That means you can combine indexes, pointers, and iterators into a joint +iteration space. Use the methods ``rows()`` and ``cols()`` to obtain +``blocked_range`` objects that represent the respective dimensions. + +The ``blocked_range3d`` class template extends this approach to 3D by adding +``pages()`` as the first dimension, followed by ``rows()`` and ``cols()``. + +The ``blocked_nd_range`` class template represents a blocked iteration +space of any dimensionality. Unlike the previously described 2D and 3D ranges, +``blocked_nd_range`` uses the same value type for all its axes, and its +constructor requires you to pass N instances of ``blocked_range`` instead of +individual boundary values. The change in the naming pattern reflects these +differences. + + +Example of a Multidimensional Iteration Space +------------------------------------------------ + +The example demonstrates calculation of a 3-dimensional filter over the pack +of feature maps. + +The ``convolution3d`` function iterates over the output cells, assigning to +each cell the result of the ``kernel3d`` function that combines the values +from a range in the feature maps. + +To run the computation in parallel, ``tbb::parallel_for`` is called with +``tbb::blocked_nd_range`` as an argument. The body function processes +the received 3D subrange in nested loops, using the method ``dim`` to get +the loop boundaries for each dimension. + + +.. 
literalinclude:: ./snippets/blocked_nd_range_example.h + :language: c++ diff --git a/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst b/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst index 724b8b6ec9..290f2f2cc3 100644 --- a/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst +++ b/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst @@ -22,14 +22,11 @@ the following steps generally occur: thread that invoked the algorithm. -The exception thrown in step 3 might be the original exception, or might -merely be a summary of type ``captured_exception``. The latter usually -occurs on current systems because propagating exceptions between threads -requires support for the C++ ``std::exception_ptr`` functionality. As -compilers evolve to support this functionality, future versions of +As compilers evolve to support this functionality, future versions of oneTBB might throw the original exception. So be sure your code can catch either type of exception. The following example demonstrates -exception handling. +exception handling: + :: diff --git a/doc/main/tbb_userguide/Linux_OS.rst b/doc/main/tbb_userguide/Linux_OS.rst index 1d25a04dcd..0f0c245720 100644 --- a/doc/main/tbb_userguide/Linux_OS.rst +++ b/doc/main/tbb_userguide/Linux_OS.rst @@ -25,12 +25,12 @@ structure for Linux\*, relative to ** - | ``LIBRARY_PATH`` | ``LD_LIBRARY_PATH`` -where +Where: * ```` - ``ia32`` or ``intel64`` + + .. note:: Starting with oneTBB 2022.0, 32-bit binaries are supported only by the open-source version of the library. 
* ```` - ``libtbb``, ``libtbbmalloc``, ``libtbbmalloc_proxy`` or ``libtbbbind`` - * ```` - ``_debug`` or empty - -* ```` - binary version in a form of ``.`` \ No newline at end of file +* ```` - binary version in a form of ``.`` diff --git a/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst b/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst index f4f78ae567..cd2d2e1a93 100644 --- a/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst +++ b/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst @@ -44,7 +44,6 @@ To do the replacement use one of the following methods: - Alternatively, add the following parameters to the linker options for the .exe or .dll file that is loaded during application startup. - For 32-bit code (note the triple underscore): @@ -52,8 +51,7 @@ To do the replacement use one of the following methods: tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy" - - + For 64-bit code (note the double underscore): diff --git a/doc/main/tbb_userguide/Windows_OS_ug.rst b/doc/main/tbb_userguide/Windows_OS_ug.rst index 3fc4a5a223..85fc3306ce 100644 --- a/doc/main/tbb_userguide/Windows_OS_ug.rst +++ b/doc/main/tbb_userguide/Windows_OS_ug.rst @@ -30,12 +30,13 @@ structure for Windows\*, relative to <*tbb_install_dir*>. - Same as corresponding ``.dll`` file. - \ -where +Where * ```` - ``ia32`` or ``intel64`` -* ```` - ``tbb``, ``tbbmalloc``, ``tbbmalloc_proxy`` or ``tbbbind`` + .. note:: Starting with oneTBB 2022.0, 32-bit binaries are supported only by the open-source version of the library. 
+* ```` - ``tbb``, ``tbbmalloc``, ``tbbmalloc_proxy`` or ``tbbbind`` * ```` - ``14`` - use for dynamic linkage with the CRT @@ -47,11 +48,10 @@ where - ``_mt`` - use for static linkage with the CRT * ```` - ``_debug`` or empty - * ```` - binary version -The last column shows which environment variables are used by the -Microsoft\* Visual C++\* or Intel® C++ Compiler Classic or Intel® oneAPI DPC++/C++ Compiler to find these +The last column shows, which environment variables are used by the +Microsoft\* Visual C++\* or Intel® C++ Compiler Classic or Intel® oneAPI DPC++/C++ Compiler, to find these subdirectories. .. CAUTION:: diff --git a/doc/main/tbb_userguide/parallel_for_os.rst b/doc/main/tbb_userguide/parallel_for_os.rst index fed07af68b..cbc7578f4c 100644 --- a/doc/main/tbb_userguide/parallel_for_os.rst +++ b/doc/main/tbb_userguide/parallel_for_os.rst @@ -55,8 +55,9 @@ before each identifier. The rest of the examples assume that such a Note the argument to ``operator()``. A ``blocked_range`` is a template class provided by the library. It describes a one-dimensional iteration space over type ``T``. Class ``parallel_for`` works with other -kinds of iteration spaces too. The library provides ``blocked_range2d`` -for two-dimensional spaces. You can define your own spaces as explained +kinds of iteration spaces too. The library provides ``blocked_range2d``, +``blocked_range3d``, and ``blocked_nd_range`` for multidimensional spaces. +You can define your own spaces as explained in :ref:`Advanced_Topic_Other_Kinds_of_Iteration_Spaces`. 
diff --git a/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp new file mode 100644 index 0000000000..7417123999 --- /dev/null +++ b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp @@ -0,0 +1,37 @@ +#include "blocked_nd_range_example.h" +#include +#include + +int main() { + const int kernel_length = 9; + const int kernel_width = 5; + const int kernel_height = 5; + + const int feature_maps_length = 128; + const int feature_maps_width = 16; + const int feature_maps_heigth = 16; + + const int out_length = feature_maps_length - kernel_length + 1; + const int out_width = feature_maps_width - kernel_width + 1; + const int out_heigth = feature_maps_heigth - kernel_height + 1; + + // Initializes feature maps with 1 in each cell and out with zeros. + std::vector>> feature_maps(feature_maps_length, std::vector>(feature_maps_width, std::vector(feature_maps_heigth, 1.0f))); + std::vector>> out(out_length, std::vector>(out_width, std::vector(out_heigth, 0.f))); + + // 3D convolution calculates the sum of all elements in the kernel + convolution3d(feature_maps, out, + out_length, out_width, out_heigth, + kernel_length, kernel_width, kernel_height); + + // Checks correctness of convolution by equality to the expected sum of elements + float expected = float(kernel_length * kernel_height * kernel_width); + for (auto i : out) { + for (auto j : i) { + for (auto k : j) { + assert(k == expected && "convolution failed to calculate correctly"); + } + } + } + return 0; +} diff --git a/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h new file mode 100644 index 0000000000..ded2a09c57 --- /dev/null +++ b/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h @@ -0,0 +1,37 @@ +#include "oneapi/tbb/blocked_nd_range.h" +#include "oneapi/tbb/parallel_for.h" + +template +float kernel3d(const Features& feature_maps, int i, int j, int k, 
+ int kernel_length, int kernel_width, int kernel_height) { + float result = 0.f; + + for (int feature_i = i; feature_i < i + kernel_length; ++feature_i) + for (int feature_j = j; feature_j < j + kernel_width; ++feature_j) + for (int feature_k = k; feature_k < k + kernel_width; ++feature_k) + result += feature_maps[feature_i][feature_j][feature_k]; + + return result; +} + +template +void convolution3d(const Features& feature_maps, Output& out, + int out_length, int out_width, int out_heigth, + int kernel_length, int kernel_width, int kernel_height) { + using range_t = oneapi::tbb::blocked_nd_range; + + oneapi::tbb::parallel_for( + range_t({0, out_length}, {0, out_width}, {0, out_heigth}), + [&](const range_t& out_range) { + auto out_x = out_range.dim(0); + auto out_y = out_range.dim(1); + auto out_z = out_range.dim(2); + + for (int i = out_x.begin(); i < out_x.end(); ++i) + for (int j = out_y.begin(); j < out_y.end(); ++j) + for (int k = out_z.begin(); k < out_z.end(); ++k) + out[i][j][k] = kernel3d(feature_maps, i, j, k, + kernel_length, kernel_width, kernel_height); + } + ); +} diff --git a/doc/main/tbb_userguide/std_invoke.rst b/doc/main/tbb_userguide/std_invoke.rst index 17ee7add99..d536eae8b2 100644 --- a/doc/main/tbb_userguide/std_invoke.rst +++ b/doc/main/tbb_userguide/std_invoke.rst @@ -204,14 +204,14 @@ Find More The following APIs supports Callable object as Bodies: -* `parallel_for `_ -* `parallel_reduce `_ -* `parallel_deterministic_reduce `_ -* `parallel_for_each `_ -* `parallel_scan `_ -* `parallel_pipeline `_ -* `function_node `_ -* `multifunction_node `_ -* `async_node `_ -* `sequencer_node `_ -* `join_node with key_matching policy `_ +* `parallel_for `_ +* `parallel_reduce `_ +* `parallel_deterministic_reduce `_ +* `parallel_for_each `_ +* `parallel_scan `_ +* `parallel_pipeline `_ +* `function_node `_ +* `multifunction_node `_ +* `async_node `_ +* `sequencer_node `_ +* `join_node with key_matching policy `_ diff --git a/examples/README.md 
b/examples/README.md index 037ca4d4e3..bf38ffba24 100644 --- a/examples/README.md +++ b/examples/README.md @@ -3,7 +3,7 @@ This directory contains example usages of oneAPI Threading Building Blocks. | Code sample name | Description |:--- |:--- -| getting_started/sub_string_finder | Example referenced by the [oneAPI Threading Building Blocks Get Started Guide](https://oneapi-src.github.io/oneTBB/GSG/get_started.html#get-started-guide). Finds the largest matching substrings. +| getting_started/sub_string_finder | Example referenced by the [oneAPI Threading Building Blocks Get Started Guide](https://uxlfoundation.github.io/oneTBB/GSG/get_started.html#get-started-guide). Finds the largest matching substrings. | concurrent_hash_map/count_strings | Concurrently inserts strings into a `concurrent_hash_map` container. | concurrent_priority_queue/shortpath | Solves the single source shortest path problem using a `concurrent_priority_queue` container. | graph/binpack | A solution to the binpacking problem using a `queue_node`, a `buffer_node`, and `function_node`s. @@ -26,7 +26,7 @@ This directory contains example usages of oneAPI Threading Building Blocks. | test_all/fibonacci | Compute Fibonacci numbers in different ways. ## System Requirements -Refer to the [System Requirements](https://github.com/oneapi-src/oneTBB/blob/master/SYSTEM_REQUIREMENTS.md) for the list of supported hardware and software. +Refer to the [System Requirements](https://github.com/uxlfoundation/oneTBB/blob/master/SYSTEM_REQUIREMENTS.md) for the list of supported hardware and software. ### Graphical User Interface (GUI) Some examples (e.g., fractal, seismic, tachyon, polygon_overlay) support different GUI modes, which may be defined via the `EXAMPLES_UI_MODE` CMake variable. 
diff --git a/examples/common/utility/utility.hpp b/examples/common/utility/utility.hpp index 024f3e99c1..0630c81362 100644 --- a/examples/common/utility/utility.hpp +++ b/examples/common/utility/utility.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include // TBB headers should not be used, as some examples may need to be built without TBB. namespace utility { @@ -356,6 +358,59 @@ class cli_argument_pack { } }; // class cli_argument_pack +// utility class to aid relative error measurement of samples +class measurements { +public: + measurements() = default; + + measurements(unsigned iterations) { + _time_intervals.reserve(iterations); + } + + inline void start() { + _startTime = std::chrono::steady_clock::now(); + } + inline void stop() { + auto _endTime = std::chrono::steady_clock::now(); + // store the end time and start time + _time_intervals.push_back(std::make_pair(_startTime, _endTime)); + } + double computeRelError() { + // Accumulate the total duration in microseconds using std::accumulate with a lambda function + assert(0 != _time_intervals.size()); + auto total_duration = std::accumulate( + _time_intervals.begin(), + _time_intervals.end(), + 0, // Start with 0 count + [](long long total, const std::pair& interval) { + // Compute the difference and add it to the total + return total + std::chrono::duration_cast( + interval.second - interval.first) + .count(); + }); + unsigned long long averageTimePerFrame = total_duration / _time_intervals.size(); + unsigned long long sumOfSquareDiff = 0; + std::for_each(_time_intervals.begin(), + _time_intervals.end(), + [&](const std::pair& interval) { + unsigned long long duration = + std::chrono::duration_cast( 
+ interval.second - interval.first) + .count(); + long long diff = duration - averageTimePerFrame; + sumOfSquareDiff += diff * diff; + }); + double stdDev = std::sqrt(sumOfSquareDiff / _time_intervals.size()); + double relError = 100 * (stdDev / averageTimePerFrame); + return relError; + } + +private: + using time_point = std::chrono::steady_clock::time_point; + time_point _startTime; + std::vector> _time_intervals; +}; + namespace internal { template bool is_power_of_2(T val) { @@ -547,6 +602,11 @@ inline void report_skipped() { << "\n"; } +inline void report_relative_error(double err) { + std::cout << "Relative_Err : " << err << " %" + << "\n"; +} + inline void parse_cli_arguments(int argc, const char* argv[], utility::cli_argument_pack cli_pack) { bool show_help = false; cli_pack.arg(show_help, "-h", "show this message"); diff --git a/examples/migration/recursive_fibonacci/task_emulation_layer.h b/examples/migration/recursive_fibonacci/task_emulation_layer.h index 7252d447a0..e3b67b93e9 100644 --- a/examples/migration/recursive_fibonacci/task_emulation_layer.h +++ b/examples/migration/recursive_fibonacci/task_emulation_layer.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -25,7 +25,7 @@ namespace task_emulation { struct task_group_pool { - task_group_pool() : pool_size(std::thread::hardware_concurrency()), task_submitters(new tbb::task_group[pool_size]) {} + task_group_pool() : pool_size(tbb::this_task_arena::max_concurrency()), task_submitters(new tbb::task_group[pool_size]) {} ~task_group_pool() { for (std::size_t i = 0; i < pool_size; ++i) { diff --git a/examples/parallel_for/seismic/main.cpp b/examples/parallel_for/seismic/main.cpp index cc30bbd3b8..4f08de9342 100644 --- a/examples/parallel_for/seismic/main.cpp +++ b/examples/parallel_for/seismic/main.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,14 +35,17 @@ struct RunOptions { //! threads.second - initialization value for scheduler utility::thread_number_range threads; int numberOfFrames; + int numberOfIterations; bool silent; bool parallel; RunOptions(utility::thread_number_range threads_, int number_of_frames_, + int number_of_iterations_, bool silent_, bool parallel_) : threads(threads_), numberOfFrames(number_of_frames_), + numberOfIterations(number_of_iterations_), silent(silent_), parallel(parallel_) {} }; @@ -53,6 +56,7 @@ RunOptions ParseCommandLine(int argc, char *argv[]) { utility::get_default_num_threads, 0, utility::get_default_num_threads()); int numberOfFrames = 0; + int numberOfIterations = 0; bool silent = false; bool serial = false; @@ -65,15 +69,19 @@ RunOptions ParseCommandLine(int argc, char *argv[]) { .positional_arg(numberOfFrames, "n-of-frames", "number of frames the example processes internally (0 means unlimited)") + .positional_arg(numberOfIterations, + "n-of-iterations", + "number of iterations the example runs internally") .arg(silent, "silent", "no output except elapsed time") .arg(serial, "serial", "in GUI mode start with serial version of 
algorithm")); - return RunOptions(threads, numberOfFrames, silent, !serial); + return RunOptions(threads, numberOfFrames, numberOfIterations, silent, !serial); } int main(int argc, char *argv[]) { oneapi::tbb::tick_count mainStartTime = oneapi::tbb::tick_count::now(); RunOptions options = ParseCommandLine(argc, argv); SeismicVideo video(u, options.numberOfFrames, options.threads.last, options.parallel); + double rel_error; // video layer init if (video.init_window(u.UniverseWidth, u.UniverseHeight)) { @@ -91,11 +99,19 @@ int main(int argc, char *argv[]) { std::cout << "Substituting 1000 for unlimited frames because not running interactively" << "\n"; } + // TODO : Extend utility::cli_argument_pack() to allow specifying the default value. + if (options.numberOfIterations <= 0) { + options.numberOfIterations = 10; + std::cout << "Setting the number of iterations = 10 default" + << "\n"; + } for (int p = options.threads.first; p <= options.threads.last; p = options.threads.step(p)) { oneapi::tbb::tick_count xwayParallelismStartTime = oneapi::tbb::tick_count::now(); u.InitializeUniverse(video); int numberOfFrames = options.numberOfFrames; + assert(options.numberOfIterations > 0 && "Number of iterations cannot be <= 0"); + unsigned numberOfIterations = unsigned(options.numberOfIterations); if (p == 0) { //run a serial version @@ -106,9 +122,15 @@ int main(int argc, char *argv[]) { else { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); - for (int i = 0; i < numberOfFrames; ++i) { - u.ParallelUpdateUniverse(); + utility::measurements mu(numberOfIterations); + for (int iter = 0; iter < numberOfIterations; ++iter) { + mu.start(); + for (int i = 0; i < numberOfFrames; ++i) { + u.ParallelUpdateUniverse(); + } + mu.stop(); } + rel_error = mu.computeRelError(); } if (!options.silent) { @@ -129,5 +151,6 @@ int main(int argc, char *argv[]) { } video.terminate(); utility::report_elapsed_time((oneapi::tbb::tick_count::now() - 
mainStartTime).seconds()); + utility::report_relative_error(rel_error); return 0; } diff --git a/include/oneapi/tbb.h b/include/oneapi/tbb.h index ad96011373..c52eb9e228 100644 --- a/include/oneapi/tbb.h +++ b/include/oneapi/tbb.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,9 +28,7 @@ #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/blocked_range2d.h" #include "oneapi/tbb/blocked_range3d.h" -#if TBB_PREVIEW_BLOCKED_RANGE_ND -#include "tbb/blocked_rangeNd.h" -#endif +#include "oneapi/tbb/blocked_nd_range.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/combinable.h" #include "oneapi/tbb/concurrent_hash_map.h" diff --git a/include/oneapi/tbb/blocked_rangeNd.h b/include/oneapi/tbb/blocked_nd_range.h similarity index 58% rename from include/oneapi/tbb/blocked_rangeNd.h rename to include/oneapi/tbb/blocked_nd_range.h index a7ba137506..3a9697896f 100644 --- a/include/oneapi/tbb/blocked_rangeNd.h +++ b/include/oneapi/tbb/blocked_nd_range.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2017-2021 Intel Corporation + Copyright (c) 2017-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,12 +14,8 @@ limitations under the License. 
*/ -#ifndef __TBB_blocked_rangeNd_H -#define __TBB_blocked_rangeNd_H - -#if !TBB_PREVIEW_BLOCKED_RANGE_ND - #error Set TBB_PREVIEW_BLOCKED_RANGE_ND to include blocked_rangeNd.h -#endif +#ifndef __TBB_blocked_nd_range_H +#define __TBB_blocked_nd_range_H #include // std::any_of #include @@ -28,6 +24,7 @@ #include "detail/_config.h" #include "detail/_template_helpers.h" // index_sequence, make_index_sequence +#include "detail/_namespace_injection.h" #include "detail/_range_common.h" #include "blocked_range.h" @@ -37,45 +34,56 @@ namespace detail { namespace d1 { /* - The blocked_rangeNd_impl uses make_index_sequence to automatically generate a ctor with + The blocked_nd_range_impl uses make_index_sequence to automatically generate a ctor with exactly N arguments of the type tbb::blocked_range. Such ctor provides an opportunity to use braced-init-list parameters to initialize each dimension. Use of parameters, whose representation is a braced-init-list, but they're not std::initializer_list or a reference to one, produces a non-deduced context within template argument deduction. - NOTE: blocked_rangeNd must be exactly a templated alias to the blocked_rangeNd_impl + NOTE: blocked_nd_range must be exactly a templated alias to the blocked_nd_range_impl (and not e.g. a derived class), otherwise it would need to declare its own ctor facing the same problem that the impl class solves. */ template> __TBB_requires(blocked_range_value) -class blocked_rangeNd_impl; +class blocked_nd_range_impl; template __TBB_requires(blocked_range_value) -class blocked_rangeNd_impl> { +class blocked_nd_range_impl> { public: //! Type of a value. using value_type = Value; -private: - //! Helper type to construct range with N tbb::blocked_range objects. - template - using dim_type_helper = tbb::blocked_range; + //! Type of a dimension range. + using dim_range_type = tbb::blocked_range; -public: - blocked_rangeNd_impl() = delete; + //! Type for the size of a range. 
+ using size_type = typename dim_range_type::size_type; + + blocked_nd_range_impl() = delete; //! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range. - blocked_rangeNd_impl(const dim_type_helper&... args) : my_dims{ {args...} } {} + blocked_nd_range_impl(const indexed_t&... args) : my_dims{ {args...} } {} + +#if __clang__ && __TBB_CLANG_VERSION < 140000 + // On clang prior to version 14.0.0, passing a single braced init list to the constructor of blocked_nd_range + // matches better on the C array constructor and generates compile-time error because of unexpected size + // Adding constraints for this constructor to force the compiler to drop it from overload resolution if the size is unexpected + template ::type> + blocked_nd_range_impl(const value_type (&size)[M], size_type grainsize = 1) : +#else + blocked_nd_range_impl(const value_type (&size)[N], size_type grainsize = 1) : +#endif + my_dims { dim_range_type(0, size[Is], grainsize)... } {} //! Dimensionality of a range. - static constexpr unsigned int ndims() { return N; } + static constexpr unsigned int dim_count() { return N; } //! Range in certain dimension. - const tbb::blocked_range& dim(unsigned int dimension) const { + const dim_range_type& dim(unsigned int dimension) const { __TBB_ASSERT(dimension < N, "out of bound"); return my_dims[dimension]; } @@ -86,44 +94,45 @@ class blocked_rangeNd_impl> { //! True if at least one dimension is empty. bool empty() const { - return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { + return std::any_of(my_dims.begin(), my_dims.end(), [](const dim_range_type& d) { return d.empty(); }); } //! True if at least one dimension is divisible. 
bool is_divisible() const { - return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { + return std::any_of(my_dims.begin(), my_dims.end(), [](const dim_range_type& d) { return d.is_divisible(); }); } - blocked_rangeNd_impl(blocked_rangeNd_impl& r, proportional_split proportion) : my_dims(r.my_dims) { + blocked_nd_range_impl(blocked_nd_range_impl& r, proportional_split proportion) : my_dims(r.my_dims) { do_split(r, proportion); } - blocked_rangeNd_impl(blocked_rangeNd_impl& r, split proportion) : my_dims(r.my_dims) { + blocked_nd_range_impl(blocked_nd_range_impl& r, split proportion) : my_dims(r.my_dims) { do_split(r, proportion); } private: - static_assert(N != 0, "zero dimensional blocked_rangeNd can't be constructed"); + static_assert(N != 0, "zero dimensional blocked_nd_range can't be constructed"); //! Ranges in each dimension. - std::array, N> my_dims; + std::array my_dims; template - void do_split(blocked_rangeNd_impl& r, split_type proportion) { - static_assert((std::is_same::value || std::is_same::value), "type of split object is incorrect"); + void do_split(blocked_nd_range_impl& r, split_type proportion) { + static_assert((std::is_same::value || std::is_same::value), + "type of split object is incorrect"); __TBB_ASSERT(r.is_divisible(), "can't split not divisible range"); - auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& first, const tbb::blocked_range& second) { - return (first.size() * second.grainsize() < second.size() * first.grainsize()); + auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const dim_range_type& first, const dim_range_type& second) { + return (first.size() * double(second.grainsize()) < second.size() * double(first.grainsize())); }); auto r_it = r.my_dims.begin() + (my_it - my_dims.begin()); - my_it->my_begin = tbb::blocked_range::do_split(*r_it, proportion); + my_it->my_begin = dim_range_type::do_split(*r_it, proportion); // (!(my_it->my_begin < 
r_it->my_end) && !(r_it->my_end < my_it->my_begin)) equals to // (my_it->my_begin == r_it->my_end), but we can't use operator== due to Value concept @@ -133,15 +142,14 @@ class blocked_rangeNd_impl> { }; template -using blocked_rangeNd = blocked_rangeNd_impl; +using blocked_nd_range = blocked_nd_range_impl; } // namespace d1 } // namespace detail inline namespace v1 { -using detail::d1::blocked_rangeNd; +using detail::d1::blocked_nd_range; } // namespace v1 } // namespace tbb -#endif /* __TBB_blocked_rangeNd_H */ - +#endif /* __TBB_blocked_nd_range_H */ diff --git a/include/oneapi/tbb/blocked_range.h b/include/oneapi/tbb/blocked_range.h index 12862fa2a1..5193faffd5 100644 --- a/include/oneapi/tbb/blocked_range.h +++ b/include/oneapi/tbb/blocked_range.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -152,7 +152,7 @@ class blocked_range { template __TBB_requires(blocked_range_value) - friend class blocked_rangeNd_impl; + friend class blocked_nd_range_impl; }; } // namespace d1 diff --git a/include/oneapi/tbb/collaborative_call_once.h b/include/oneapi/tbb/collaborative_call_once.h index e3742347f1..18e3bbb245 100644 --- a/include/oneapi/tbb/collaborative_call_once.h +++ b/include/oneapi/tbb/collaborative_call_once.h @@ -172,7 +172,7 @@ class collaborative_once_flag : no_copy { spin_wait_until_eq(m_state, expected); } while (!m_state.compare_exchange_strong(expected, desired)); } - + template void do_collaborative_call_once(Fn&& f) { std::uintptr_t expected = m_state.load(std::memory_order_acquire); diff --git a/include/oneapi/tbb/concurrent_unordered_map.h b/include/oneapi/tbb/concurrent_unordered_map.h index 336425cc8f..9cade0a94e 100644 --- a/include/oneapi/tbb/concurrent_unordered_map.h +++ b/include/oneapi/tbb/concurrent_unordered_map.h @@ -1,5 +1,5 @@ /* - Copyright (c) 
2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,14 +24,14 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct concurrent_unordered_map_traits { using value_type = std::pair; using key_type = Key; using allocator_type = Allocator; - using hash_compare_type = hash_compare; + using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { @@ -399,13 +399,13 @@ void swap( concurrent_unordered_multimap& lhs lhs.swap(rhs); } -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { -using detail::d1::concurrent_unordered_map; -using detail::d1::concurrent_unordered_multimap; +using detail::d2::concurrent_unordered_map; +using detail::d2::concurrent_unordered_multimap; using detail::split; } // inline namespace v1 diff --git a/include/oneapi/tbb/concurrent_unordered_set.h b/include/oneapi/tbb/concurrent_unordered_set.h index c135b92222..b7e4b4cafc 100644 --- a/include/oneapi/tbb/concurrent_unordered_set.h +++ b/include/oneapi/tbb/concurrent_unordered_set.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -23,14 +23,14 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct concurrent_unordered_set_traits { using key_type = Key; using value_type = key_type; using allocator_type = Allocator; - using hash_compare_type = hash_compare; + using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { @@ -318,13 +318,13 @@ void swap( concurrent_unordered_multiset& lhs, lhs.swap(rhs); } -} // namespace d1 +} // namespace d2 } // namespace detail inline namespace v1 { -using detail::d1::concurrent_unordered_set; -using detail::d1::concurrent_unordered_multiset; +using detail::d2::concurrent_unordered_set; +using detail::d2::concurrent_unordered_multiset; using detail::split; } // inline namespace v1 diff --git a/include/oneapi/tbb/concurrent_vector.h b/include/oneapi/tbb/concurrent_vector.h index 2a2cb1e4bf..27cdc47355 100644 --- a/include/oneapi/tbb/concurrent_vector.h +++ b/include/oneapi/tbb/concurrent_vector.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -593,7 +593,7 @@ class concurrent_vector segment_type disabled_segment = nullptr; if (table[0].compare_exchange_strong(disabled_segment, new_segment)) { - this->extend_table_if_necessary(table, 0, first_block_size); + this->extend_table_if_necessary(table, /*start_index*/0, /*end_index*/first_block_size); for (size_type i = 1; i < first_block; ++i) { table[i].store(new_segment, std::memory_order_release); } @@ -826,8 +826,8 @@ class concurrent_vector template iterator internal_grow( size_type start_idx, size_type end_idx, const Args&... 
args ) { - this->assign_first_block_if_necessary(this->segment_index_of(end_idx - 1) + 1); size_type seg_index = this->segment_index_of(end_idx - 1); + this->assign_first_block_if_necessary(seg_index + 1); segment_table_type table = this->get_table(); this->extend_table_if_necessary(table, start_idx, end_idx); diff --git a/include/oneapi/tbb/detail/_concurrent_unordered_base.h b/include/oneapi/tbb/detail/_concurrent_unordered_base.h index be1f46b20e..85f54d0a57 100644 --- a/include/oneapi/tbb/detail/_concurrent_unordered_base.h +++ b/include/oneapi/tbb/detail/_concurrent_unordered_base.h @@ -46,7 +46,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template class concurrent_unordered_base; @@ -171,7 +171,7 @@ class value_node : public list_node value_node( sokey_type ord_key ) : base_type(ord_key) {} ~value_node() {} value_type* storage() { - return reinterpret_cast(&my_value); + return &my_value; } value_type& value() { @@ -179,8 +179,9 @@ class value_node : public list_node } private: - using aligned_storage_type = typename std::aligned_storage::type; - aligned_storage_type my_value; + union { + value_type my_value; + }; }; // class value_node template @@ -237,7 +238,7 @@ class concurrent_unordered_base { template using is_transparent = dependent_bool, T>; public: - using node_type = node_handle; + using node_type = d1::node_handle; explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(), const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) @@ -441,7 +442,7 @@ class concurrent_unordered_base { std::pair insert( node_type&& nh ) { if (!nh.empty()) { - value_node_ptr insert_node = node_handle_accessor::get_node_ptr(nh); + value_node_ptr insert_node = d1::node_handle_accessor::get_node_ptr(nh); auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { insert_node->init(order_key); return insert_node; @@ -451,7 +452,7 @@ class concurrent_unordered_base { // If 
the insertion succeeded - set node handle to the empty state __TBB_ASSERT(insert_result.remaining_node == nullptr, "internal_insert_node should not return the remaining node if the insertion succeeded"); - node_handle_accessor::deactivate(nh); + d1::node_handle_accessor::deactivate(nh); } return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; } @@ -521,12 +522,12 @@ class concurrent_unordered_base { node_type unsafe_extract( const_iterator pos ) { internal_extract(pos.get_node_ptr()); - return node_handle_accessor::construct(pos.get_node_ptr()); + return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( iterator pos ) { internal_extract(pos.get_node_ptr()); - return node_handle_accessor::construct(pos.get_node_ptr()); + return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( const key_type& key ) { @@ -787,11 +788,11 @@ class concurrent_unordered_base { static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1; class unordered_segment_table - : public segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> + : public d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> { using self_type = unordered_segment_table; using atomic_node_ptr = std::atomic; - using base_type = segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; + using base_type = d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; using segment_type = typename base_type::segment_type; using base_allocator_type = typename base_type::allocator_type; @@ -1212,7 +1213,7 @@ class concurrent_unordered_base { // Node handle with curr cannot be used directly in insert call, because // the destructor of node_type will destroy curr - node_type curr_node = node_handle_accessor::construct(curr); + node_type curr_node = d1::node_handle_accessor::construct(curr); // 
If the insertion fails - return ownership of the node to the source if (!insert(std::move(curr_node)).second) { @@ -1230,7 +1231,7 @@ class concurrent_unordered_base { curr->set_next(next_node); source_prev->set_next(curr); source_prev = curr; - node_handle_accessor::deactivate(curr_node); + d1::node_handle_accessor::deactivate(curr_node); } else { source.my_size.fetch_sub(1, std::memory_order_relaxed); } @@ -1507,7 +1508,7 @@ bool operator!=( const concurrent_unordered_base& lhs, #pragma warning(pop) // warning 4127 is back #endif -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_config.h b/include/oneapi/tbb/detail/_config.h index 0e5fbfe92f..e676b1558b 100644 --- a/include/oneapi/tbb/detail/_config.h +++ b/include/oneapi/tbb/detail/_config.h @@ -521,6 +521,11 @@ #define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES) #endif +#ifndef __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +#define __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT (TBB_PREVIEW_FLOW_GRAPH_FEATURES \ + || TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT) +#endif + #if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS #define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 #endif diff --git a/include/oneapi/tbb/detail/_exception.h b/include/oneapi/tbb/detail/_exception.h index 21c61188d0..d1aa1fc69a 100644 --- a/include/oneapi/tbb/detail/_exception.h +++ b/include/oneapi/tbb/detail/_exception.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -66,10 +66,16 @@ class TBB_EXPORT missing_wait : public std::exception { }; //! 
Exception for impossible finalization of task_sheduler_handle +#if __APPLE__ + #pragma GCC visibility push(default) +#endif class TBB_EXPORT unsafe_wait : public std::runtime_error { public: unsafe_wait(const char* msg) : std::runtime_error(msg) {} }; +#if __APPLE__ + #pragma GCC visibility pop +#endif //! Gathers all throw operators in one place. /** Its purpose is to minimize code bloat that can be caused by throw operators diff --git a/include/oneapi/tbb/detail/_export.h b/include/oneapi/tbb/detail/_export.h index 4c015223b5..24b6c08efb 100644 --- a/include/oneapi/tbb/detail/_export.h +++ b/include/oneapi/tbb/detail/_export.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,27 +18,35 @@ #define __TBB_detail__export_H #if defined(__MINGW32__) - #define _EXPORT __declspec(dllexport) -#elif defined(_WIN32) || defined(__unix__) || defined(__APPLE__) // Use .def files for these - #define _EXPORT + #define __TBB_EXPORT __declspec(dllexport) +#elif defined(_WIN32) // Use .def files for these + #define __TBB_EXPORT +#elif defined(__unix__) || defined(__APPLE__) // Use .def files for these + #define __TBB_EXPORT __attribute__ ((visibility ("default"))) #else #error "Unknown platform/compiler" #endif #if __TBB_BUILD - #define TBB_EXPORT _EXPORT + #define TBB_EXPORT __TBB_EXPORT #else #define TBB_EXPORT #endif #if __TBBMALLOC_BUILD - #define TBBMALLOC_EXPORT _EXPORT + #define TBBMALLOC_EXPORT __TBB_EXPORT #else #define TBBMALLOC_EXPORT #endif +#if __TBBMALLOCPROXY_BUILD + #define TBBMALLOCPROXY_EXPORT __TBB_EXPORT +#else + #define TBBMALLOCPROXY_EXPORT +#endif + #if __TBBBIND_BUILD - #define TBBBIND_EXPORT _EXPORT + #define TBBBIND_EXPORT __TBB_EXPORT #else #define TBBBIND_EXPORT #endif diff --git a/include/oneapi/tbb/detail/_flow_graph_body_impl.h 
b/include/oneapi/tbb/detail/_flow_graph_body_impl.h index 8ac11211f6..21da06ce03 100644 --- a/include/oneapi/tbb/detail/_flow_graph_body_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_body_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 (in flow_graph.h) +// included in namespace tbb::detail::d2 (in flow_graph.h) typedef std::uint64_t tag_value; @@ -53,7 +53,7 @@ namespace graph_policy_namespace { // K == type of field used for key-matching. Each tag-matching port will be provided // functor that, given an object accepted by the port, will return the /// field of type K being used for matching. - template::type > > + template::type > > __TBB_requires(tbb::detail::hash_compare) struct key_matching { typedef K key_type; @@ -77,7 +77,7 @@ template< typename Output > class input_body : no_assign { public: virtual ~input_body() {} - virtual Output operator()(flow_control& fc) = 0; + virtual Output operator()(d1::flow_control& fc) = 0; virtual input_body* clone() = 0; }; @@ -86,7 +86,7 @@ template< typename Output, typename Body> class input_body_leaf : public input_body { public: input_body_leaf( const Body &_body ) : body(_body) { } - Output operator()(flow_control& fc) override { return body(fc); } + Output operator()(d1::flow_control& fc) override { return body(fc); } input_body_leaf* clone() override { return new input_body_leaf< Output, Body >(body); } @@ -249,12 +249,12 @@ template< typename NodeType > class forward_task_bypass : public graph_task { NodeType &my_node; public: - forward_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n + forward_task_bypass( graph& g, d1::small_object_allocator& 
allocator, NodeType &n , node_priority_t node_priority = no_priority ) : graph_task(g, allocator, node_priority), my_node(n) {} - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.forward_task(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; @@ -264,7 +264,7 @@ class forward_task_bypass : public graph_task { return next_task; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -272,29 +272,57 @@ class forward_task_bypass : public graph_task { //! A task that calls a node's apply_body_bypass function, passing in an input of type Input // return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr -template< typename NodeType, typename Input > -class apply_body_task_bypass : public graph_task { +template< typename NodeType, typename Input, typename BaseTaskType = graph_task> +class apply_body_task_bypass + : public BaseTaskType +{ NodeType &my_node; Input my_input; + + using check_metainfo = std::is_same; + using without_metainfo = std::true_type; + using with_metainfo = std::false_type; + + graph_task* call_apply_body_bypass_impl(without_metainfo) { + return my_node.apply_body_bypass(my_input + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* call_apply_body_bypass_impl(with_metainfo) { + return my_node.apply_body_bypass(my_input, message_metainfo{this->get_msg_wait_context_vertices()}); + } +#endif + + graph_task* call_apply_body_bypass() { + return call_apply_body_bypass_impl(check_metainfo{}); + } + public: +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n, const Input &i, + node_priority_t node_priority, Metainfo&& metainfo ) + : BaseTaskType(g, allocator, node_priority, 
std::forward(metainfo).waiters()) + , my_node(n), my_input(i) {} +#endif - apply_body_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n, const Input &i - , node_priority_t node_priority = no_priority - ) : graph_task(g, allocator, node_priority), - my_node(n), my_input(i) {} + apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType& n, const Input& i, + node_priority_t node_priority = no_priority ) + : BaseTaskType(g, allocator, node_priority), my_node(n), my_input(i) {} - task* execute(execution_data& ed) override { - graph_task* next_task = my_node.apply_body_bypass( my_input ); + d1::task* execute(d1::execution_data& ed) override { + graph_task* next_task = call_apply_body_bypass(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; else if (next_task) next_task = prioritize_task(my_node.graph_reference(), *next_task); - finalize(ed); + BaseTaskType::template finalize(ed); return next_task; } - task* cancel(execution_data& ed) override { - finalize(ed); + d1::task* cancel(d1::execution_data& ed) override { + BaseTaskType::template finalize(ed); return nullptr; } }; @@ -304,10 +332,10 @@ template< typename NodeType > class input_node_task_bypass : public graph_task { NodeType &my_node; public: - input_node_task_bypass( graph& g, small_object_allocator& allocator, NodeType &n ) + input_node_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n ) : graph_task(g, allocator), my_node(n) {} - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.apply_body_bypass( ); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; @@ -317,7 +345,7 @@ class input_node_task_bypass : public graph_task { return next_task; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -343,6 +371,15 @@ class threshold_regulatormy_graph; } @@ -361,7 
+398,14 @@ class threshold_regulator : public continue_receiver, no_ T *my_node; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Intentionally ignore the metainformation + // If there are more items associated with passed metainfo to be processed + // They should be stored in the buffer before the limiter_node + graph_task* execute(const message_metainfo&) override { +#else graph_task* execute() override { +#endif return my_node->decrement_counter( 1 ); } diff --git a/include/oneapi/tbb/detail/_flow_graph_cache_impl.h b/include/oneapi/tbb/detail/_flow_graph_cache_impl.h index 059f198055..647f3dc1b6 100644 --- a/include/oneapi/tbb/detail/_flow_graph_cache_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_cache_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 (in flow_graph.h) +// included in namespace tbb::detail::d2 (in flow_graph.h) //! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock. 
template< typename T, typename M=spin_mutex > @@ -98,9 +98,12 @@ class predecessor_cache : public node_cache< sender, M > { // Do not work with the passed pointer here as it may not be fully initialized yet } - bool get_item( output_type& v ) { +private: + bool get_item_impl( output_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo_ptr = nullptr) ) + { - bool msg = false; + bool successful_get = false; do { predecessor_type *src; @@ -113,18 +116,35 @@ class predecessor_cache : public node_cache< sender, M > { } // Try to get from this sender - msg = src->try_get( v ); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (metainfo_ptr) { + successful_get = src->try_get( v, *metainfo_ptr ); + } else +#endif + { + successful_get = src->try_get( v ); + } - if (msg == false) { + if (successful_get == false) { // Relinquish ownership of the edge register_successor(*src, *my_owner); } else { // Retain ownership of the edge this->add(*src); } - } while ( msg == false ); - return msg; + } while ( successful_get == false ); + return successful_get; } +public: + bool get_item( output_type& v ) { + return get_item_impl(v); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( output_type& v, message_metainfo& metainfo ) { + return get_item_impl(v, &metainfo); + } +#endif // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). 
void reset() { @@ -157,8 +177,9 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { // Do not work with the passed pointer here as it may not be fully initialized yet } - bool try_reserve( output_type &v ) { - bool msg = false; +private: + bool try_reserve_impl( output_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo) ) { + bool successful_reserve = false; do { predecessor_type* pred = nullptr; @@ -172,9 +193,16 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { } // Try to get from this sender - msg = pred->try_reserve( v ); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (metainfo) { + successful_reserve = pred->try_reserve( v, *metainfo ); + } else +#endif + { + successful_reserve = pred->try_reserve( v ); + } - if (msg == false) { + if (successful_reserve == false) { typename mutex_type::scoped_lock lock(this->my_mutex); // Relinquish ownership of the edge register_successor( *pred, *this->my_owner ); @@ -183,11 +211,21 @@ class reservable_predecessor_cache : public predecessor_cache< T, M > { // Retain ownership of the edge this->add( *pred); } - } while ( msg == false ); + } while ( successful_reserve == false ); - return msg; + return successful_reserve; + } +public: + bool try_reserve( output_type& v ) { + return try_reserve_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(nullptr)); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_reserve( output_type& v, message_metainfo& metainfo ) { + return try_reserve_impl(v, &metainfo); + } +#endif + bool try_release() { reserved_src.load(std::memory_order_relaxed)->try_release(); reserved_src.store(nullptr, std::memory_order_relaxed); @@ -268,6 +306,9 @@ class successor_cache : no_copy { } virtual graph_task* try_put_task( const T& t ) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) = 0; +#endif }; // successor_cache //! 
An abstract cache of successors, specialized to continue_msg @@ -327,6 +368,9 @@ class successor_cache< continue_msg, M > : no_copy { } virtual graph_task* try_put_task( const continue_msg& t ) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* try_put_task( const continue_msg& t, const message_metainfo& metainfo ) = 0; +#endif }; // successor_cache< continue_msg > //! A cache of successors that are broadcast to @@ -336,19 +380,12 @@ class broadcast_cache : public successor_cache { typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; -public: - - broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { - // Do not work with the passed pointer here as it may not be fully initialized yet - } - - // as above, but call try_put_task instead, and return the last task we received (if any) - graph_task* try_put_task( const T &t ) override { + graph_task* try_put_task_impl( const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { graph_task * last_task = nullptr; typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { - graph_task *new_task = (*i)->try_put_task(t); + graph_task *new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); // workaround for icc bug graph& graph_ref = (*i)->graph_reference(); last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary @@ -365,6 +402,21 @@ class broadcast_cache : public successor_cache { } return last_task; } +public: + + broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } + + graph_task* try_put_task( const T &t ) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + 
graph_task* try_put_task( const T &t, const message_metainfo& metainfo ) override { + return try_put_task_impl(t, metainfo); + } +#endif // call try_put_task and return list of received tasks bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { @@ -411,11 +463,15 @@ class round_robin_cache : public successor_cache { return this->my_successors.size(); } - graph_task* try_put_task( const T &t ) override { +private: + + graph_task* try_put_task_impl( const T &t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) + { typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { - graph_task* new_task = (*i)->try_put_task(t); + graph_task* new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( new_task ) { return new_task; } else { @@ -429,6 +485,17 @@ class round_robin_cache : public successor_cache { } return nullptr; } + +public: + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) override { + return try_put_task_impl(t, metainfo); + } +#endif }; #endif // __TBB__flow_graph_cache_impl_H diff --git a/include/oneapi/tbb/detail/_flow_graph_impl.h b/include/oneapi/tbb/detail/_flow_graph_impl.h index 8207667f37..55063b93e1 100644 --- a/include/oneapi/tbb/detail/_flow_graph_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -30,7 +30,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { class graph_task; static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1; @@ -123,32 +123,98 @@ void enqueue_in_graph_arena(graph &g, graph_task& arena_task); class graph; //! Base class for tasks generated by graph nodes. -class graph_task : public task { +class graph_task : public d1::task { public: - graph_task(graph& g, small_object_allocator& allocator - , node_priority_t node_priority = no_priority - ) - : my_graph(g) - , priority(node_priority) - , my_allocator(allocator) - {} + graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority = no_priority); + graph& my_graph; // graph instance the task belongs to // TODO revamp: rename to my_priority node_priority_t priority; template - void destruct_and_deallocate(const execution_data& ed); + void destruct_and_deallocate(const d1::execution_data& ed); protected: template - void finalize(const execution_data& ed); + void finalize(const d1::execution_data& ed); private: // To organize task_list graph_task* my_next{ nullptr }; - small_object_allocator my_allocator; + d1::small_object_allocator my_allocator; + d1::wait_tree_vertex_interface* my_reference_vertex; // TODO revamp: elaborate internal interfaces to avoid friends declarations friend class graph_task_list; friend graph_task* prioritize_task(graph& g, graph_task& gt); }; +inline bool is_this_thread_in_graph_arena(graph& g); + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +class trackable_messages_graph_task : public graph_task { +public: + trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority, + const std::forward_list& msg_waiters) + : graph_task(g, allocator, node_priority) + , my_msg_wait_context_vertices(msg_waiters) + { + auto last_iterator = my_msg_reference_vertices.cbefore_begin(); + + for (auto& msg_waiter : my_msg_wait_context_vertices) { + // If the task is 
created by the thread outside the graph arena, the lifetime of the thread reference vertex + // may be shorter that the lifetime of the task, so thread reference vertex approach cannot be used + // and the task should be associated with the msg wait context itself + d1::wait_tree_vertex_interface* ref_vertex = is_this_thread_in_graph_arena(g) ? + r1::get_thread_reference_vertex(msg_waiter) : + msg_waiter; + last_iterator = my_msg_reference_vertices.emplace_after(last_iterator, + ref_vertex); + ref_vertex->reserve(1); + } + } + + trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority, + std::forward_list&& msg_waiters) + : graph_task(g, allocator, node_priority) + , my_msg_wait_context_vertices(std::move(msg_waiters)) + { + } + + const std::forward_list get_msg_wait_context_vertices() const { + return my_msg_wait_context_vertices; + } + +protected: + template + void finalize(const d1::execution_data& ed) { + auto wait_context_vertices = std::move(my_msg_wait_context_vertices); + auto msg_reference_vertices = std::move(my_msg_reference_vertices); + graph_task::finalize(ed); + + // If there is no thread reference vertices associated with the task + // then this task was created by transferring the ownership from other metainfo + // instance (e.g. while taking from the buffer) + if (msg_reference_vertices.empty()) { + for (auto& msg_waiter : wait_context_vertices) { + msg_waiter->release(1); + } + } else { + for (auto& msg_waiter : msg_reference_vertices) { + msg_waiter->release(1); + } + } + } +private: + // Each task that holds information about single message wait_contexts should hold two lists + // The first one is wait_contexts associated with the message itself. They are needed + // to be able to broadcast the list of wait_contexts to the node successors while executing the task. 
+ // The second list is a list of reference vertices for each wait_context_vertex in the first list + // to support the distributed reference counting schema + std::forward_list my_msg_wait_context_vertices; + std::forward_list my_msg_reference_vertices; +}; // class trackable_messages_graph_task +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + struct graph_task_comparator { bool operator()(const graph_task* left, const graph_task* right) { return left->priority < right->priority; @@ -157,18 +223,18 @@ struct graph_task_comparator { typedef tbb::concurrent_priority_queue graph_task_priority_queue_t; -class priority_task_selector : public task { +class priority_task_selector : public d1::task { public: - priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator) + priority_task_selector(graph_task_priority_queue_t& priority_queue, d1::small_object_allocator& allocator) : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} - task* execute(execution_data& ed) override { + task* execute(d1::execution_data& ed) override { next_task(); __TBB_ASSERT(my_task, nullptr); task* t_next = my_task->execute(ed); my_allocator.delete_object(this, ed); return t_next; } - task* cancel(execution_data& ed) override { + task* cancel(d1::execution_data& ed) override { if (!my_task) { next_task(); } @@ -190,7 +256,7 @@ class priority_task_selector : public task { } graph_task_priority_queue_t& my_priority_queue; - small_object_allocator my_allocator; + d1::small_object_allocator my_allocator; graph_task* my_task; }; @@ -281,7 +347,7 @@ class graph : no_copy, public graph_proxy { caught_exception = false; try_call([this] { my_task_arena->execute([this] { - wait(my_wait_context, *my_context); + d1::wait(my_wait_context_vertex.get_context(), *my_context); }); cancelled = my_context->is_group_execution_cancelled(); }).on_exception([this] { @@ -332,7 +398,7 @@ class graph : no_copy, public graph_proxy { bool 
exception_thrown() { return caught_exception; } private: - wait_context my_wait_context; + d1::wait_context_vertex my_wait_context_vertex; task_group_context *my_context; bool own_context; bool cancelled; @@ -349,19 +415,25 @@ class graph : no_copy, public graph_proxy { graph_task_priority_queue_t my_priority_queue; + d1::wait_context_vertex& get_wait_context_vertex() { return my_wait_context_vertex; } + friend void activate_graph(graph& g); friend void deactivate_graph(graph& g); friend bool is_graph_active(graph& g); + friend bool is_this_thread_in_graph_arena(graph& g); friend graph_task* prioritize_task(graph& g, graph_task& arena_task); friend void spawn_in_graph_arena(graph& g, graph_task& arena_task); friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task); - friend class task_arena_base; + friend class d1::task_arena_base; + friend class graph_task; + template + friend class receiver; }; // class graph template -inline void graph_task::destruct_and_deallocate(const execution_data& ed) { +inline void graph_task::destruct_and_deallocate(const d1::execution_data& ed) { auto allocator = my_allocator; // TODO: investigate if direct call of derived destructor gives any benefits. 
this->~graph_task(); @@ -369,10 +441,27 @@ inline void graph_task::destruct_and_deallocate(const execution_data& ed) { } template -inline void graph_task::finalize(const execution_data& ed) { - graph& g = my_graph; +inline void graph_task::finalize(const d1::execution_data& ed) { + d1::wait_tree_vertex_interface* reference_vertex = my_reference_vertex; destruct_and_deallocate(ed); - g.release_wait(); + reference_vertex->release(); +} + +inline graph_task::graph_task(graph& g, d1::small_object_allocator& allocator, + node_priority_t node_priority) + : my_graph(g) + , priority(node_priority) + , my_allocator(allocator) +{ + // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex + // may be shorter that the lifetime of the task, so thread reference vertex approach cannot be used + // and the task should be associated with the graph wait context itself + // TODO: consider how reference counting can be improved for such a use case. Most common example is the async_node + d1::wait_context_vertex* graph_wait_context_vertex = &my_graph.get_wait_context_vertex(); + my_reference_vertex = is_this_thread_in_graph_arena(g) ? r1::get_thread_reference_vertex(graph_wait_context_vertex) + : graph_wait_context_vertex; + __TBB_ASSERT(my_reference_vertex, nullptr); + my_reference_vertex->reserve(); } //******************************************************************************** @@ -424,15 +513,20 @@ inline bool is_graph_active(graph& g) { return g.my_is_active; } +inline bool is_this_thread_in_graph_arena(graph& g) { + __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); + return r1::execution_slot(*g.my_task_arena) != d1::slot_id(-1); +} + inline graph_task* prioritize_task(graph& g, graph_task& gt) { if( no_priority == gt.priority ) return > //! Non-preemptive priority pattern. The original task is submitted as a work item to the //! 
priority queue, and a new critical task is created to take and execute a work item with - //! the highest known priority. The reference counting responsibility is transferred (via - //! allocate_continuation) to the new task. - task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); + //! the highest known priority. The reference counting responsibility is transferred to + //! the new task. + d1::task* critical_task = gt.my_allocator.new_object(g.my_priority_queue, gt.my_allocator); __TBB_ASSERT( critical_task, "bad_alloc?" ); g.my_priority_queue.push(>); using tbb::detail::d1::submit; @@ -443,7 +537,7 @@ inline graph_task* prioritize_task(graph& g, graph_task& gt) { //! Spawns a task inside graph arena inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) { if (is_graph_active(g)) { - task* gt = prioritize_task(g, arena_task); + d1::task* gt = prioritize_task(g, arena_task); if( !gt ) return; @@ -464,12 +558,12 @@ inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) { __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" ); // TODO revamp: decide on the approach that does not postpone critical task - if( task* gt = prioritize_task(g, arena_task) ) + if( d1::task* gt = prioritize_task(g, arena_task) ) submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false); } } -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h b/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h index f4f55a6c7a..a743310079 100644 --- a/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 +// included in namespace tbb::detail::d2 #include "_flow_graph_types_impl.h" @@ -31,9 +31,9 @@ // successor. template - graph_task* do_try_put(const T &v, void *p) { + graph_task* do_try_put(const T &v, void *p __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { typename IndexerNodeBaseType::output_type o(K, v); - return reinterpret_cast(p)->try_put_task(&o); + return reinterpret_cast(p)->try_put_task(&o __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } template @@ -41,7 +41,7 @@ template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element::type T; - graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + auto indexer_node_put_task = do_try_put; std::get(my_input).set_up(p, indexer_node_put_task, g); indexer_helper::template set_indexer_node_pointer(my_input, p, g); } @@ -52,7 +52,7 @@ template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element<0, TupleTypes>::type T; - graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; + auto indexer_node_put_task = do_try_put; std::get<0>(my_input).set_up(p, indexer_node_put_task, g); } }; @@ -61,7 +61,8 @@ class indexer_input_port : public receiver { private: void* my_indexer_ptr; - typedef graph_task* (* forward_function_ptr)(T const &, void* ); + typedef graph_task* (* forward_function_ptr)(T const &, void* + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo&)); forward_function_ptr my_try_put_task; graph* my_graph; public: @@ -76,9 +77,15 @@ template friend class broadcast_cache; template friend class round_robin_cache; graph_task* try_put_task(const T &v) override { - return my_try_put_task(v, my_indexer_ptr); + return my_try_put_task(v, my_indexer_ptr 
__TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + return my_try_put_task(v, my_indexer_ptr, metainfo); + } +#endif + graph& graph_reference() const override { return *my_graph; } @@ -118,7 +125,7 @@ }; typedef indexer_node_base class_type; - class indexer_node_base_operation : public aggregated_operation { + class indexer_node_base_operation : public d1::aggregated_operation { public: char type; union { @@ -126,15 +133,23 @@ successor_type *my_succ; graph_task* bypass_t; }; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo const* metainfo; +#endif indexer_node_base_operation(const output_type* e, op_type t) : - type(char(t)), my_arg(e) {} + type(char(t)), my_arg(e) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + indexer_node_base_operation(const output_type* e, op_type t, const message_metainfo& info) + : type(char(t)), my_arg(e), metainfo(&info) {} +#endif indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(indexer_node_base_operation* op_list) { indexer_node_base_operation *current; @@ -153,7 +168,8 @@ current->status.store( SUCCEEDED, std::memory_order_release); break; case try__put_task: { - current->bypass_t = my_successors.try_put_task(*(current->my_arg)); + current->bypass_t = my_successors.try_put_task(*(current->my_arg) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(current->metainfo))); current->status.store( SUCCEEDED, std::memory_order_release); // return of try_put_task actual return value } break; @@ -186,8 +202,11 @@ return op_data.status == SUCCEEDED; } 
- graph_task* try_put_task(output_type const *v) { // not a virtual method in this class - indexer_node_base_operation op_data(v, try__put_task); + // not a virtual method in this class + graph_task* try_put_task(output_type const *v + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + indexer_node_base_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); return op_data.bypass_t; } diff --git a/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h b/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h index 423033b1d5..cf7c54b852 100644 --- a/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -37,8 +37,14 @@ class item_buffer { typedef T item_type; enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 }; protected: + struct aligned_space_item { + item_type item; + buffer_item_state state; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + }; typedef size_t size_type; - typedef std::pair aligned_space_item; typedef aligned_space buffer_item_type; typedef typename allocator_traits::template rebind_alloc allocator_type; buffer_item_type *my_array; @@ -49,45 +55,89 @@ class item_buffer { bool buffer_empty() const { return my_head == my_tail; } - aligned_space_item &item(size_type i) { - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + aligned_space_item &element(size_type i) { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), 
nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); +#endif return *my_array[i & (my_array_size - 1) ].begin(); } - const aligned_space_item &item(size_type i) const { - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->second))%alignment_of::value), nullptr); - __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->first))%alignment_of::value), nullptr); + const aligned_space_item &element(size_type i) const { + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); +#endif return *my_array[i & (my_array_size-1)].begin(); } - bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); } + bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (element(i).state != no_item); } #if TBB_USE_ASSERT - bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; } + bool my_item_reserved(size_type i) const { return element(i).state == reserved_item; } #endif // object management in buffer const item_type &get_my_item(size_t i) const { __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item"); - item_type* itm = const_cast(reinterpret_cast(&item(i).first)); - return *itm; + return element(i).item; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo& get_my_metainfo(size_t i) { + __TBB_ASSERT(my_item_valid(i), "attempt to get invalid item"); + return element(i).metainfo; + } +#endif + // may be called 
with an empty slot or a slot that has already been constructed into. - void set_my_item(size_t i, const item_type &o) { - if(item(i).second != no_item) { + void set_my_item(size_t i, const item_type &o + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if(element(i).state != no_item) { destroy_item(i); } - new(&(item(i).first)) item_type(o); - item(i).second = has_item; + new(&(element(i).item)) item_type(o); + element(i).state = has_item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + new(&element(i).metainfo) message_metainfo(metainfo); + + for (auto& waiter : metainfo.waiters()) { + waiter->reserve(1); + } +#endif + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + void set_my_item(size_t i, const item_type& o, message_metainfo&& metainfo) { + if(element(i).state != no_item) { + destroy_item(i); + } + + new(&(element(i).item)) item_type(o); + new(&element(i).metainfo) message_metainfo(std::move(metainfo)); + // Skipping the reservation on metainfo.waiters since the ownership + // is moving from metainfo to the cache + element(i).state = has_item; } +#endif // destructively-fetch an object from the buffer +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + void fetch_item(size_t i, item_type& o, message_metainfo& metainfo) { + __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); + o = get_my_item(i); // could have std::move assign semantics + metainfo = std::move(get_my_metainfo(i)); + destroy_item(i); + } +#else void fetch_item(size_t i, item_type &o) { __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); o = get_my_item(i); // could have std::move assign semantics destroy_item(i); } +#endif // move an existing item from one slot to another. The moved-to slot must be unoccupied, // the moved-from slot must exist and not be reserved. 
The after, from will be empty, @@ -95,12 +145,22 @@ class item_buffer { void move_item(size_t to, size_t from) { __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot"); __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot"); - set_my_item(to, get_my_item(from)); // could have std::move semantics + // could have std::move semantics + set_my_item(to, get_my_item(from) __TBB_FLOW_GRAPH_METAINFO_ARG(get_my_metainfo(from))); destroy_item(from); - } // put an item in an empty slot. Return true if successful, else false +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + bool place_item(size_t here, const item_type &me, Metainfo&& metainfo) { +#if !TBB_DEPRECATED_SEQUENCER_DUPLICATES + if(my_item_valid(here)) return false; +#endif + set_my_item(here, me, std::forward(metainfo)); + return true; + } +#else bool place_item(size_t here, const item_type &me) { #if !TBB_DEPRECATED_SEQUENCER_DUPLICATES if(my_item_valid(here)) return false; @@ -108,19 +168,36 @@ class item_buffer { set_my_item(here, me); return true; } +#endif // could be implemented with std::move semantics void swap_items(size_t i, size_t j) { __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)"); item_type temp = get_my_item(i); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo temp_metainfo = get_my_metainfo(i); + set_my_item(i, get_my_item(j), get_my_metainfo(j)); + set_my_item(j, temp, temp_metainfo); +#else set_my_item(i, get_my_item(j)); set_my_item(j, temp); +#endif } void destroy_item(size_type i) { __TBB_ASSERT(my_item_valid(i), "destruction of invalid item"); - item(i).first.~item_type(); - item(i).second = no_item; + + auto& e = element(i); + e.item.~item_type(); + e.state = no_item; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto& msg_waiter : e.metainfo.waiters()) { + msg_waiter->release(1); + } + + e.metainfo.~message_metainfo(); +#endif } // returns the front element @@ -130,6 +207,14 @@ class 
item_buffer { return get_my_item(my_head); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& front_metainfo() const + { + __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); + return element(my_head).metainfo; + } +#endif + // returns the back element const item_type& back() const { @@ -137,9 +222,23 @@ class item_buffer { return get_my_item(my_tail - 1); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& back_metainfo() const { + __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item"); + return element(my_tail - 1).metainfo; + } +#endif + // following methods are for reservation of the front of a buffer. - void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); item(i).second = reserved_item; } - void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); item(i).second = has_item; } + void reserve_item(size_type i) { + __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); + element(i).state = reserved_item; + } + + void release_item(size_type i) { + __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); + element(i).state = has_item; + } void destroy_front() { destroy_item(my_head); ++my_head; } void destroy_back() { destroy_item(my_tail-1); --my_tail; } @@ -163,14 +262,18 @@ class item_buffer { buffer_item_type* new_array = allocator_type().allocate(new_size); // initialize validity to "no" - for( size_type i=0; isecond = no_item; } + for( size_type i=0; istate = no_item; } for( size_type i=my_head; ifirst); + char *new_space = (char *)&(new_array[i&(new_size-1)].begin()->item); (void)new(new_space) item_type(get_my_item(i)); - new_array[i&(new_size-1)].begin()->second = item(i).second; + new_array[i&(new_size-1)].begin()->state = element(i).state; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + char* meta_space = (char 
*)&(new_array[i&(new_size-1)].begin()->metainfo); + ::new(meta_space) message_metainfo(std::move(element(i).metainfo)); +#endif } } @@ -180,33 +283,61 @@ class item_buffer { my_array_size = new_size; } - bool push_back(item_type &v) { - if(buffer_full()) { + bool push_back(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + if (buffer_full()) { grow_my_array(size() + 1); } - set_my_item(my_tail, v); + set_my_item(my_tail, v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++my_tail; return true; } - bool pop_back(item_type &v) { - if (!my_item_valid(my_tail-1)) { + bool pop_back(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) + { + if (!my_item_valid(my_tail - 1)) { return false; } - v = this->back(); + auto& e = element(my_tail - 1); + v = e.item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + metainfo = std::move(e.metainfo); +#endif + destroy_back(); return true; } - bool pop_front(item_type &v) { - if(!my_item_valid(my_head)) { + bool pop_front(item_type& v + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) + { + if (!my_item_valid(my_head)) { return false; } - v = this->front(); + auto& e = element(my_head); + v = e.item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + metainfo = std::move(e.metainfo); +#endif + destroy_front(); return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool pop_back(item_type& v) { + message_metainfo metainfo; + return pop_back(v, metainfo); + } + + bool pop_front(item_type& v) { + message_metainfo metainfo; + return pop_front(v, metainfo); + } +#endif + // This is used both for reset and for grow_my_array. In the case of grow_my_array // we want to retain the values of the head and tail. 
void clean_up_buffer(bool reset_pointers) { @@ -261,6 +392,18 @@ class reservable_item_buffer : public item_buffer { return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve_front(T& v, message_metainfo& metainfo) { + if (my_reserved || !my_item_valid(this->my_head)) return false; + my_reserved = true; + // reserving the head + v = this->front(); + metainfo = this->front_metainfo(); + this->reserve_item(this->my_head); + return true; + } +#endif + void consume_front() { __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item"); this->destroy_front(); diff --git a/include/oneapi/tbb/detail/_flow_graph_join_impl.h b/include/oneapi/tbb/detail/_flow_graph_join_impl.h index 5515421ede..8bca9a2c41 100644 --- a/include/oneapi/tbb/detail/_flow_graph_join_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_join_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. 
#endif -// included into namespace tbb::detail::d1 +// included into namespace tbb::detail::d2 struct forwarding_base : no_assign { forwarding_base(graph &g) : graph_ref(g) {} @@ -89,17 +89,49 @@ return true; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + if (!std::get(my_input).reserve(std::get(out), element_metainfo)) return false; + if (!join_helper::reserve(my_input, out, metainfo)) { + release_my_reservation(my_input); + return false; + } + metainfo.merge(element_metainfo); + return true; + + } +#endif + template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { bool res = std::get(my_input).get_item(std::get(out) ); // may fail return join_helper::get_my_item(my_input, out) && res; // do get on other inputs before returning } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool res = std::get(my_input).get_item(std::get(out), element_metainfo); + metainfo.merge(element_metainfo); + return join_helper::get_my_item(my_input, out, metainfo) && res; + } +#endif + template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + return get_my_item(my_input, out, metainfo); + } +#endif + template static inline void reset_my_port(InputTuple &my_input) { join_helper::reset_my_port(my_input); @@ -163,16 +195,43 @@ return std::get<0>( my_input ).reserve( std::get<0>( out ) ); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + 
message_metainfo element_metainfo; + bool result = std::get<0>(my_input).reserve(std::get<0>(out), element_metainfo); + metainfo.merge(element_metainfo); + return result; + } +#endif + template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { return std::get<0>(my_input).get_item(std::get<0>(out)); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + message_metainfo element_metainfo; + bool res = std::get<0>(my_input).get_item(std::get<0>(out), element_metainfo); + metainfo.merge(element_metainfo); + return res; + } +#endif + template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { + return get_my_item(my_input, out, metainfo); + } +#endif + template static inline void reset_my_port(InputTuple &my_input) { std::get<0>(my_input).reset_port(); @@ -216,23 +275,31 @@ }; typedef reserving_port class_type; - class reserving_port_operation : public aggregated_operation { + class reserving_port_operation : public d1::aggregated_operation { public: char type; union { T *my_arg; predecessor_type *my_pred; }; - reserving_port_operation(const T& e, op_type t) : - type(char(t)), my_arg(const_cast(&e)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif + reserving_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : + type(char(t)), my_arg(const_cast(&e)) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + reserving_port_operation(const T& e, op_type t) + : type(char(t)), my_arg(const_cast(&e)), metainfo(nullptr) {} +#endif reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), 
my_pred(const_cast(&s)) {} reserving_port_operation(op_type t) : type(char(t)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(reserving_port_operation* op_list) { reserving_port_operation *current; @@ -262,14 +329,26 @@ if ( reserved ) { current->status.store( FAILED, std::memory_order_release); } - else if ( my_predecessors.try_reserve( *(current->my_arg) ) ) { - reserved = true; - current->status.store( SUCCEEDED, std::memory_order_release); - } else { - if ( my_predecessors.empty() ) { - my_join->increment_port_count(); + else { + bool reserve_result = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + reserve_result = my_predecessors.try_reserve(*(current->my_arg), + *(current->metainfo)); + } else +#endif + { + reserve_result = my_predecessors.try_reserve(*(current->my_arg)); + } + if (reserve_result) { + reserved = true; + current->status.store( SUCCEEDED, std::memory_order_release); + } else { + if ( my_predecessors.empty() ) { + my_join->increment_port_count(); + } + current->status.store( FAILED, std::memory_order_release); } - current->status.store( FAILED, std::memory_order_release); } break; case rel_res: @@ -294,6 +373,10 @@ return nullptr; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T&, const message_metainfo&) override { return nullptr; } +#endif + graph& graph_reference() const override { return my_join->graph_ref; } @@ -333,6 +416,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve( T& v, message_metainfo& metainfo ) { + reserving_port_operation op_data(v, res_item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + //! 
Release the port void release( ) { reserving_port_operation op_data(rel_res); @@ -376,31 +467,42 @@ enum op_type { get__item, res_port, try__put_task }; - class queueing_port_operation : public aggregated_operation { + class queueing_port_operation : public d1::aggregated_operation { public: char type; T my_val; T* my_arg; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif // constructor for value parameter - queueing_port_operation(const T& e, op_type t) : - type(char(t)), my_val(e), my_arg(nullptr) + queueing_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) + : type(char(t)), my_val(e), my_arg(nullptr) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) {} // constructor for pointer parameter - queueing_port_operation(const T* p, op_type t) : + queueing_port_operation(const T* p, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(p)) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + queueing_port_operation(const T* p, op_type t) + : type(char(t)), my_arg(const_cast(p)), bypass_t(nullptr), metainfo(nullptr) {} +#endif // constructor with no parameter queueing_port_operation(op_type t) : type(char(t)), my_arg(nullptr) , bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(queueing_port_operation* op_list) { queueing_port_operation *current; @@ -412,7 +514,12 @@ case try__put_task: { graph_task* rtask = nullptr; was_empty = this->buffer_empty(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + 
this->push_back(current->my_val, *(current->metainfo)); +#else this->push_back(current->my_val); +#endif if (was_empty) rtask = my_join->decrement_port_count(false); else rtask = SUCCESSFULLY_ENQUEUED; @@ -424,6 +531,11 @@ if(!this->buffer_empty()) { __TBB_ASSERT(current->my_arg, nullptr); *(current->my_arg) = this->front(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + *(current->metainfo) = this->front_metainfo(); + } +#endif current->status.store( SUCCEEDED, std::memory_order_release); } else { @@ -447,14 +559,27 @@ template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; - graph_task* try_put_task(const T &v) override { - queueing_port_operation op_data(v, try__put_task); + + private: + graph_task* try_put_task_impl(const T& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + queueing_port_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator"); if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED; return op_data.bypass_t; } + protected: + graph_task* try_put_task(const T &v) override { + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + return try_put_task_impl(v, metainfo); + } +#endif + graph& graph_reference() const override { return my_join->graph_ref; } @@ -481,6 +606,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( T& v, message_metainfo& metainfo ) { + queueing_port_operation op_data(&v, get__item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + // reset_port is called when item is accepted by 
successor, but // is initiated by join_node. void reset_port() { @@ -517,13 +650,23 @@ const K& operator()(const table_item_type& v) { return v.my_key; } }; + template + struct key_matching_port_base { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + using type = metainfo_hash_buffer; +#else + using type = hash_buffer; +#endif + }; + // the ports can have only one template parameter. We wrap the types needed in // a traits type template< class TraitsType > class key_matching_port : public receiver, - public hash_buffer< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, - typename TraitsType::KHash > { + public key_matching_port_base< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, + typename TraitsType::KHash >::type + { public: typedef TraitsType traits; typedef key_matching_port class_type; @@ -533,7 +676,7 @@ typedef typename receiver::predecessor_type predecessor_type; typedef typename TraitsType::TtoK type_to_key_func_type; typedef typename TraitsType::KHash hash_compare_type; - typedef hash_buffer< key_type, input_type, type_to_key_func_type, hash_compare_type > buffer_type; + typedef typename key_matching_port_base::type buffer_type; private: // ----------- Aggregator ------------ @@ -541,24 +684,33 @@ enum op_type { try__put, get__item, res_port }; - class key_matching_port_operation : public aggregated_operation { + class key_matching_port_operation : public d1::aggregated_operation { public: char type; input_type my_val; input_type *my_arg; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo = nullptr; +#endif // constructor for value parameter - key_matching_port_operation(const input_type& e, op_type t) : - type(char(t)), my_val(e), my_arg(nullptr) {} + key_matching_port_operation(const input_type& e, op_type t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) + : type(char(t)), my_val(e), my_arg(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) 
{} + // constructor for pointer parameter - key_matching_port_operation(const input_type* p, op_type t) : - type(char(t)), my_arg(const_cast(p)) {} + key_matching_port_operation(const input_type* p, op_type t + __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) + : type(char(t)), my_arg(const_cast(p)) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} + // constructor with no parameter key_matching_port_operation(op_type t) : type(char(t)), my_arg(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; void handle_operations(key_matching_port_operation* op_list) { key_matching_port_operation *current; @@ -567,18 +719,35 @@ op_list = op_list->next; switch(current->type) { case try__put: { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + bool was_inserted = this->insert_with_key(current->my_val, *(current->metainfo)); +#else bool was_inserted = this->insert_with_key(current->my_val); +#endif // return failure if a duplicate insertion occurs current->status.store( was_inserted ? 
SUCCEEDED : FAILED, std::memory_order_release); } break; - case get__item: + case get__item: { // use current_key from FE for item __TBB_ASSERT(current->my_arg, nullptr); - if(!this->find_with_key(my_join->current_key, *(current->my_arg))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(current->metainfo, nullptr); + bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg), + *(current->metainfo)); +#else + bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg)); +#endif +#if TBB_USE_DEBUG + if (!find_result) { __TBB_ASSERT(false, "Failed to find item corresponding to current_key."); } +#else + tbb::detail::suppress_unused_warning(find_result); +#endif current->status.store( SUCCEEDED, std::memory_order_release); + } break; case res_port: // use current_key from FE for item @@ -593,17 +762,28 @@ template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; - graph_task* try_put_task(const input_type& v) override { - key_matching_port_operation op_data(v, try__put); + private: + graph_task* try_put_task_impl(const input_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + key_matching_port_operation op_data(v, try__put __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); graph_task* rtask = nullptr; my_aggregator.execute(&op_data); if(op_data.status == SUCCEEDED) { - rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn + rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn // rtask has to reflect the return status of the try_put if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; } return rtask; } + protected: + graph_task* try_put_task(const input_type& v) override { + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const input_type& v, const 
message_metainfo& metainfo) override { + return try_put_task_impl(v, metainfo); + } +#endif graph& graph_reference() const override { return my_join->graph_ref; @@ -640,6 +820,15 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool get_item( input_type& v, message_metainfo& metainfo ) { + // aggregator uses current_key from FE for Key + key_matching_port_operation op_data(&v, get__item, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + // reset_port is called when item is accepted by successor, but // is initiated by join_node. void reset_port() { @@ -695,10 +884,9 @@ graph_task* decrement_port_count() override { if(ports_with_no_inputs.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); - graph_ref.reserve_wait(); spawn_in_graph_arena(this->graph_ref, *t); } } @@ -726,6 +914,13 @@ return join_helper::reserve(my_inputs, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + if (ports_with_no_inputs) return false; + return join_helper::reserve(my_inputs, out, metainfo); + } +#endif + void tuple_accepted() { join_helper::consume_reservations(my_inputs); } @@ -768,10 +963,9 @@ { if(ports_with_no_items.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); - graph_ref.reserve_wait(); if( !handle_task ) return t; spawn_in_graph_arena(this->graph_ref, *t); @@ -800,6 +994,13 @@ return join_helper::get_items(my_inputs, out); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, 
message_metainfo& metainfo) { + if(ports_with_no_items) return false; + return join_helper::get_items(my_inputs, out, metainfo); + } +#endif + void tuple_accepted() { reset_port_count(); join_helper::reset_ports(my_inputs); @@ -854,23 +1055,30 @@ enum op_type { res_count, inc_count, may_succeed, try_make }; typedef join_node_FE, InputTuple, OutputTuple> class_type; - class key_matching_FE_operation : public aggregated_operation { + class key_matching_FE_operation : public d1::aggregated_operation { public: char type; unref_key_type my_val; output_type* my_output; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo = nullptr; +#endif // constructor for value parameter key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), my_output(nullptr), bypass_t(nullptr) {} key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + key_matching_FE_operation(output_type *p, op_type t, message_metainfo& info) + : type(char(t)), my_output(p), bypass_t(nullptr), metainfo(&info) {} +#endif // constructor with no parameter key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator my_aggregator; // called from aggregator, so serialized // returns a task pointer if the a task would have been enqueued but we asked that @@ -881,13 +1089,15 @@ bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); this->current_key = t; this->delete_with_key(this->current_key); // remove the key - if(join_helper::get_items(my_inputs, l_out)) { // <== call back - this->push_back(l_out); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; 
+#endif + if(join_helper::get_items(my_inputs, l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { // <== call back + this->push_back(l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(do_fwd) { // we enqueue if receiving an item from predecessor, not if successor asks for item - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; rtask = allocator.new_object(this->graph_ref, allocator, *my_node); - this->graph_ref.reserve_wait(); do_fwd = false; } // retire the input values @@ -937,6 +1147,11 @@ } else { *(current->my_output) = this->front(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + *(current->metainfo) = this->front_metainfo(); + } +#endif current->status.store( SUCCEEDED, std::memory_order_release); } break; @@ -1010,6 +1225,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { + key_matching_FE_operation op_data(&out, try_make, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + void tuple_accepted() { reset_port_count(); // reset current_key after ports reset. 
} @@ -1044,7 +1267,7 @@ }; typedef join_node_base class_type; - class join_node_base_operation : public aggregated_operation { + class join_node_base_operation : public d1::aggregated_operation { public: char type; union { @@ -1052,17 +1275,25 @@ successor_type *my_succ; }; graph_task* bypass_t; - join_node_base_operation(const output_type& e, op_type t) : type(char(t)), - my_arg(const_cast(&e)), bypass_t(nullptr) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif + join_node_base_operation(const output_type& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) + : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr) + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + join_node_base_operation(const output_type& e, op_type t) + : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr), metainfo(nullptr) {} +#endif join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)), bypass_t(nullptr) {} join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} }; - typedef aggregating_functor handler_type; - friend class aggregating_functor; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; bool forwarder_busy; - aggregator my_aggregator; + d1::aggregator my_aggregator; void handle_operations(join_node_base_operation* op_list) { join_node_base_operation *current; @@ -1073,10 +1304,9 @@ case reg_succ: { my_successors.register_successor(*(current->my_succ)); if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass< join_node_base > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(my_graph, *t); forwarder_busy = true; } @@ -1089,7 +1319,26 @@ break; case try__get: 
if(tuple_build_may_succeed()) { - if(try_to_make_tuple(*(current->my_arg))) { + bool make_tuple_result = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + make_tuple_result = try_to_make_tuple(*(current->my_arg), *(current->metainfo)); + } else +#endif + { + make_tuple_result = try_to_make_tuple(*(current->my_arg)); + } + if(make_tuple_result) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (current->metainfo) { + // Since elements would be removed from queues while calling to tuple_accepted + // together with corresponding message_metainfo objects + // we need to prolong the wait until the successor would create a task for removed elements + for (auto waiter : current->metainfo->waiters()) { + waiter->reserve(1); + } + } +#endif tuple_accepted(); current->status.store( SUCCEEDED, std::memory_order_release); } @@ -1110,9 +1359,14 @@ // them from the input ports after forwarding is complete? if(tuple_build_may_succeed()) { // checks output queue of FE do { - build_succeeded = try_to_make_tuple(out); // fetch front_end of queue +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + // fetch front_end of queue + build_succeeded = try_to_make_tuple(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(build_succeeded) { - graph_task *new_task = my_successors.try_put_task(out); + graph_task *new_task = + my_successors.try_put_task(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); last_task = combine_tasks(my_graph, last_task, new_task); if(new_task) { tuple_accepted(); @@ -1175,6 +1429,14 @@ return op_data.status == SUCCEEDED; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( output_type &v, message_metainfo& metainfo) override { + join_node_base_operation op_data(v, try__get, metainfo); + my_aggregator.execute(&op_data); + return op_data.status == SUCCEEDED; + } +#endif + protected: void reset_node(reset_flags f) override { input_ports_type::reset(f); diff --git 
a/include/oneapi/tbb/detail/_flow_graph_node_impl.h b/include/oneapi/tbb/detail/_flow_graph_node_impl.h index b79c53ddbf..336cb069c6 100644 --- a/include/oneapi/tbb/detail/_flow_graph_node_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_node_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -34,6 +34,12 @@ class function_input_queue : public item_buffer { return this->item_buffer::front(); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + const message_metainfo& front_metainfo() const { + return this->item_buffer::front_metainfo(); + } +#endif + void pop() { this->destroy_front(); } @@ -41,6 +47,12 @@ class function_input_queue : public item_buffer { bool push( T& t ) { return this->push_back( t ); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool push( T& t, const message_metainfo& metainfo ) { + return this->push_back(t, metainfo); + } +#endif }; //! Input and scheduling for a function node that takes a type Input as input @@ -87,11 +99,14 @@ class function_input_base : public receiver, no_assign { } graph_task* try_put_task( const input_type& t) override { - if ( my_is_no_throw ) - return try_put_task_impl(t, has_policy()); - else - return try_put_task_impl(t, std::false_type()); + return try_put_task_base(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const input_type& t, const message_metainfo& metainfo ) override { + return try_put_task_base(t, metainfo); } +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Adds src to the list of cached predecessors. 
bool register_predecessor( predecessor_type &src ) override { @@ -148,9 +163,12 @@ class function_input_base : public receiver, no_assign { private: friend class apply_body_task_bypass< class_type, input_type >; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + friend class apply_body_task_bypass< class_type, input_type, trackable_messages_graph_task >; +#endif friend class forward_task_bypass< class_type >; - class operation_type : public aggregated_operation< operation_type > { + class operation_type : public d1::aggregated_operation< operation_type > { public: char type; union { @@ -158,31 +176,49 @@ class function_input_base : public receiver, no_assign { predecessor_type *r; }; graph_task* bypass_t; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo; +#endif operation_type(const input_type& e, op_type t) : - type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) {} + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + , metainfo(nullptr) +#endif + {} +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + operation_type(const input_type& e, op_type t, const message_metainfo& info) : + type(char(t)), elem(const_cast(&e)), bypass_t(nullptr), + metainfo(const_cast(&info)) {} +#endif operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {} }; bool forwarder_busy; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator< handler_type, operation_type > my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator< handler_type, operation_type > my_aggregator; graph_task* perform_queued_requests() { graph_task* new_task = nullptr; if(my_queue) { if(!my_queue->empty()) { ++my_concurrency; - new_task = create_body_task(my_queue->front()); + // TODO: consider removing metainfo from the queue using move semantics to avoid + // ref counter increase + new_task = 
create_body_task(my_queue->front() + __TBB_FLOW_GRAPH_METAINFO_ARG(my_queue->front_metainfo())); my_queue->pop(); } } else { input_type i; - if(my_predecessors.get_item(i)) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if(my_predecessors.get_item(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { ++my_concurrency; - new_task = create_body_task(i); + new_task = create_body_task(i __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } return new_task; @@ -233,10 +269,13 @@ class function_input_base : public receiver, no_assign { __TBB_ASSERT(my_max_concurrency != 0, nullptr); if (my_concurrency < my_max_concurrency) { ++my_concurrency; - graph_task * new_task = create_body_task(*(op->elem)); + graph_task* new_task = create_body_task(*(op->elem) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))); op->bypass_t = new_task; op->status.store(SUCCEEDED, std::memory_order_release); - } else if ( my_queue && my_queue->push(*(op->elem)) ) { + } else if ( my_queue && my_queue->push(*(op->elem) + __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))) ) + { op->bypass_t = SUCCESSFULLY_ENQUEUED; op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -258,8 +297,10 @@ class function_input_base : public receiver, no_assign { } } - graph_task* internal_try_put_bypass( const input_type& t ) { - operation_type op_data(t, tryput_bypass); + graph_task* internal_try_put_bypass( const input_type& t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { + operation_type op_data(t, tryput_bypass __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); if( op_data.status == SUCCEEDED ) { return op_data.bypass_t; @@ -267,43 +308,75 @@ class function_input_base : public receiver, no_assign { return nullptr; } - graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type ) { + graph_task* try_put_task_base(const input_type& t + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { 
+ if ( my_is_no_throw ) + return try_put_task_impl(t, has_policy() + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + else + return try_put_task_impl(t, std::false_type() + __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + } + + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { if( my_max_concurrency == 0 ) { - return apply_body_bypass(t); + return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { operation_type check_op(t, occupy_concurrency); my_aggregator.execute(&check_op); if( check_op.status == SUCCEEDED ) { - return apply_body_bypass(t); + return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } - return internal_try_put_bypass(t); + return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } - graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type ) { + graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + { if( my_max_concurrency == 0 ) { - return create_body_task(t); + return create_body_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { - return internal_try_put_bypass(t); + return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } //! Applies the body to the provided input // then decides if more work is available - graph_task* apply_body_bypass( const input_type &i ) { - return static_cast(this)->apply_body_impl_bypass(i); + graph_task* apply_body_bypass( const input_type &i + __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) + + { + return static_cast(this)->apply_body_impl_bypass(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } //! 
allocates a task to apply a body - graph_task* create_body_task( const input_type &input ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + graph_task* create_body_task( const input_type &input, Metainfo&& metainfo ) +#else + graph_task* create_body_task( const input_type &input ) +#endif + { if (!is_graph_active(my_graph_ref)) { return nullptr; } // TODO revamp: extract helper for common graph task allocation part - small_object_allocator allocator{}; - typedef apply_body_task_bypass task_type; - graph_task* t = allocator.new_object( my_graph_ref, allocator, *this, input, my_priority ); - graph_reference().reserve_wait(); + d1::small_object_allocator allocator{}; + graph_task* t = nullptr; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (!metainfo.empty()) { + using task_type = apply_body_task_bypass; + t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority, std::forward(metainfo)); + } else +#endif + { + using task_type = apply_body_task_bypass; + t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority); + } return t; } @@ -327,10 +400,9 @@ class function_input_base : public receiver, no_assign { if (!is_graph_active(my_graph_ref)) { return nullptr; } - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object( graph_reference(), allocator, *this, my_priority ); - graph_reference().reserve_wait(); return t; } @@ -398,7 +470,9 @@ class function_input : public function_input_base::emit_this(g,t,p); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static graph_task* emit_this(graph& g, const TupleType& t, PortsType& p, + const message_metainfo& metainfo) + { + // TODO: consider to collect all the tasks in task_list and spawn them all at once + graph_task* last_task = std::get(p).try_put_task(std::get(t), metainfo); + check_task_and_spawn(g, last_task); + return emit_element::emit_this(g, t, p, 
metainfo); + } +#endif }; template<> @@ -588,6 +676,17 @@ struct emit_element<1> { check_task_and_spawn(g, last_task); return SUCCESSFULLY_ENQUEUED; } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + template + static graph_task* emit_this(graph& g, const TupleType& t, PortsType& ports, + const message_metainfo& metainfo) + { + graph_task* last_task = std::get<0>(ports).try_put_task(std::get<0>(t), metainfo); + check_task_and_spawn(g, last_task); + return SUCCESSFULLY_ENQUEUED; + } +#endif }; //! Implements methods for an executable node that takes continue_msg as input @@ -654,18 +753,25 @@ class continue_input : public continue_receiver { virtual broadcast_cache &successors() = 0; friend class apply_body_task_bypass< class_type, continue_msg >; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + friend class apply_body_task_bypass< class_type, continue_msg, trackable_messages_graph_task >; +#endif //! Applies the body to the provided input - graph_task* apply_body_bypass( input_type ) { + graph_task* apply_body_bypass( input_type __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { // There is an extra copied needed to capture the // body execution without the try_put fgt_begin_body( my_body ); output_type v = (*my_body)( continue_msg() ); fgt_end_body( my_body ); - return successors().try_put_task( v ); + return successors().try_put_task( v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* execute(const message_metainfo& metainfo) override { +#else graph_task* execute() override { +#endif if(!is_graph_active(my_graph_ref)) { return nullptr; } @@ -677,13 +783,21 @@ class continue_input : public continue_receiver { #if _MSC_VER && !__INTEL_COMPILER #pragma warning (pop) #endif - return apply_body_bypass( continue_msg() ); + return apply_body_bypass( continue_msg() __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } else { - small_object_allocator allocator{}; - typedef apply_body_task_bypass task_type; 
- graph_task* t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); - graph_reference().reserve_wait(); + d1::small_object_allocator allocator{}; + graph_task* t = nullptr; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (!metainfo.empty()) { + using task_type = apply_body_task_bypass; + t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority, metainfo ); + } else +#endif + { + using task_type = apply_body_task_bypass; + t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); + } return t; } } @@ -755,6 +869,12 @@ class multifunction_output : public function_output { return my_successors.try_put_task(i); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const output_type& i, const message_metainfo& metainfo) { + return my_successors.try_put_task(i, metainfo); + } +#endif + template friend struct emit_element; }; // multifunction_output diff --git a/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h b/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h index ce867121f9..8440bd7008 100644 --- a/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2021 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. 
#endif -// Included in namespace tbb::detail::d1 (in flow_graph.h) +// Included in namespace tbb::detail::d2 (in flow_graph.h) #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET // Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get diff --git a/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h b/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h index 8c20993795..47ecfb2a84 100644 --- a/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h +++ b/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template struct declare_body_types { @@ -51,10 +51,10 @@ template struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; @@ -63,7 +63,7 @@ template struct body_types : declare_body_types {}; template -struct body_types : declare_body_types {}; +struct body_types : declare_body_types {}; template using input_t = typename body_types::input_type; @@ -100,7 +100,7 @@ decltype(decide_on_operator_overload(std::declval())) decide_on_callable_t template input_node(GraphOrSet&&, Body) ->input_node(0))>>; - + #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template @@ -268,7 +268,7 @@ template write_once_node(const NodeSet&) ->write_once_node>; #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h 
b/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h index 0d9de17654..0f7c0d174f 100644 --- a/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,32 +30,88 @@ // elements in the table are a simple list; we need pointer to next element to // traverse the chain -template -struct buffer_element_type { - // the second parameter below is void * because we can't forward-declare the type - // itself, so we just reinterpret_cast below. - typedef typename aligned_pair::type type; + +template +struct hash_buffer_element : public aligned_pair { + using key_type = Key; + using value_type = ValueType; + + value_type* get_value_ptr() { return reinterpret_cast(this->first); } + hash_buffer_element* get_next() { return reinterpret_cast(this->second); } + void set_next(hash_buffer_element* new_next) { this->second = reinterpret_cast(new_next); } + + void create_element(const value_type& v) { + ::new(this->first) value_type(v); + } + + void create_element(hash_buffer_element&& other) { + ::new(this->first) value_type(std::move(*other.get_value_ptr())); + } + + void destroy_element() { + get_value_ptr()->~value_type(); + } +}; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template +struct metainfo_hash_buffer_element : public aligned_triple { + using key_type = Key; + using value_type = ValueType; + + value_type* get_value_ptr() { return reinterpret_cast(this->first); } + metainfo_hash_buffer_element* get_next() { + return reinterpret_cast(this->second); + } + void set_next(metainfo_hash_buffer_element* new_next) { this->second = reinterpret_cast(new_next); } + message_metainfo& get_metainfo() { return this->third; } + + void create_element(const value_type& v, 
const message_metainfo& metainfo) { + __TBB_ASSERT(this->third.empty(), nullptr); + ::new(this->first) value_type(v); + this->third = metainfo; + + for (auto waiter : metainfo.waiters()) { + waiter->reserve(1); + } + } + + void create_element(metainfo_hash_buffer_element&& other) { + __TBB_ASSERT(this->third.empty(), nullptr); + ::new(this->first) value_type(std::move(*other.get_value_ptr())); + this->third = std::move(other.get_metainfo()); + } + + void destroy_element() { + get_value_ptr()->~value_type(); + + for (auto waiter : get_metainfo().waiters()) { + waiter->release(1); + } + get_metainfo() = message_metainfo{}; + } }; +#endif template < - typename Key, // type of key within ValueType - typename ValueType, + typename ElementType, typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType typename HashCompare, // has hash and equal - typename Allocator=tbb::cache_aligned_allocator< typename aligned_pair::type > + typename Allocator=tbb::cache_aligned_allocator > -class hash_buffer : public HashCompare { +class hash_buffer_impl : public HashCompare { public: static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table - typedef ValueType value_type; - typedef typename buffer_element_type< value_type >::type element_type; + typedef typename ElementType::key_type key_type; + typedef typename ElementType::value_type value_type; + typedef ElementType element_type; typedef value_type *pointer_type; typedef element_type *list_array_type; // array we manage manually typedef list_array_type *pointer_array_type; typedef typename std::allocator_traits::template rebind_alloc pointer_array_allocator_type; typedef typename std::allocator_traits::template rebind_alloc elements_array_allocator; - typedef typename std::decay::type Knoref; + typedef typename std::decay::type Knoref; private: ValueToKey *my_key; @@ -69,9 +125,9 @@ class hash_buffer : public HashCompare { void set_up_free_list( element_type 
**p_free_list, list_array_type la, size_t sz) { for(size_t i=0; i < sz - 1; ++i ) { // construct free list - la[i].second = &(la[i+1]); + la[i].set_next(&(la[i + 1])); } - la[sz-1].second = nullptr; + la[sz - 1].set_next(nullptr); *p_free_list = (element_type *)&(la[0]); } @@ -101,15 +157,18 @@ class hash_buffer : public HashCompare { { DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size); new_elements_array = elements_array_allocator().allocate(my_size); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < my_size; ++i) { + ::new(new_elements_array + i) element_type(); + } +#endif new_pointer_array = pointer_array_allocator_type().allocate(new_size); for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr; set_up_free_list(&new_free_list, new_elements_array, my_size ); for(size_t i=0; i < my_size; ++i) { - for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->second)) { - value_type *ov = reinterpret_cast(&(op->first)); - // could have std::move semantics - internal_insert_with_key(new_pointer_array, new_size, new_free_list, *ov); + for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->get_next())) { + internal_insert_with_key(new_pointer_array, new_size, new_free_list, std::move(*op)); } } my_cleanup.my_pa = nullptr; @@ -126,15 +185,26 @@ class hash_buffer : public HashCompare { // v should have perfect forwarding if std::move implemented. // we use this method to move elements in grow_array, so can't use class fields + template + const value_type& get_value_from_pack(const Value& value, const Args&...) { + return value; + } + + template + const value_type& get_value_from_pack(Element&& element) { + return *(element.get_value_ptr()); + } + + template void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list, - const value_type &v) { + Args&&... 
args) { size_t l_mask = p_sz-1; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - size_t h = this->hash(tbb::detail::invoke(*my_key, v)) & l_mask; + size_t h = this->hash(tbb::detail::invoke(*my_key, get_value_from_pack(args...))) & l_mask; __TBB_ASSERT(p_free_list, "Error: free list not set up."); - element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->second); - (void) new(&(my_elem->first)) value_type(v); - my_elem->second = p_pointer_array[h]; + element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->get_next()); + my_elem->create_element(std::forward(args)...); + my_elem->set_next(p_pointer_array[h]); p_pointer_array[h] = my_elem; } @@ -142,6 +212,11 @@ class hash_buffer : public HashCompare { pointer_array = pointer_array_allocator_type().allocate(my_size); for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr; elements_array = elements_array_allocator().allocate(my_size / 2); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < my_size / 2; ++i) { + ::new(elements_array + i) element_type(); + } +#endif set_up_free_list(&free_list, elements_array, my_size / 2); } @@ -151,13 +226,8 @@ class hash_buffer : public HashCompare { for(size_t i = 0; i < sz; ++i ) { element_type *p_next; for( element_type *p = pa[i]; p; p = p_next) { - p_next = (element_type *)p->second; - // TODO revamp: make sure type casting is correct. - void* ptr = (void*)(p->first); -#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER - suppress_unused_warning(ptr); -#endif - ((value_type*)ptr)->~value_type(); + p_next = p->get_next(); + p->destroy_element(); } } pointer_array_allocator_type().deallocate(pa, sz); @@ -166,6 +236,11 @@ class hash_buffer : public HashCompare { // Separate test (if allocation of pa throws, el may be allocated. // but no elements will be constructed.) 
if(el) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (std::size_t i = 0; i < sz / 2; ++i) { + (el + i)->~element_type(); + } +#endif elements_array_allocator().deallocate(el, sz / 2); el = nullptr; } @@ -174,17 +249,17 @@ class hash_buffer : public HashCompare { } public: - hash_buffer() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { + hash_buffer_impl() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { internal_initialize_buffer(); } - ~hash_buffer() { + ~hash_buffer_impl() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); delete my_key; my_key = nullptr; } - hash_buffer(const hash_buffer&) = delete; - hash_buffer& operator=(const hash_buffer&) = delete; + hash_buffer_impl(const hash_buffer_impl&) = delete; + hash_buffer_impl& operator=(const hash_buffer_impl&) = delete; void reset() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); @@ -197,34 +272,41 @@ class hash_buffer : public HashCompare { // pointer is used to clone() ValueToKey* get_key_func() { return my_key; } - bool insert_with_key(const value_type &v) { - pointer_type p = nullptr; + template + bool insert_with_key(const value_type &v, Args&&... args) { + element_type* p = nullptr; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(find_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { - p->~value_type(); - (void) new(p) value_type(v); // copy-construct into the space + if(find_element_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { + p->destroy_element(); + p->create_element(v, std::forward(args)...); return false; } ++nelements; if(nelements*2 > my_size) grow_array(); - internal_insert_with_key(pointer_array, my_size, free_list, v); + internal_insert_with_key(pointer_array, my_size, free_list, v, std::forward(args)...); return true; } - // returns true and sets v to array element if found, else returns false. 
- bool find_ref_with_key(const Knoref& k, pointer_type &v) { + bool find_element_ref_with_key(const Knoref& k, element_type*& v) { size_t i = this->hash(k) & mask(); - for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->second)) { - pointer_type pv = reinterpret_cast(&(p->first)); + for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->get_next())) { __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); - if(this->equal(tbb::detail::invoke(*my_key, *pv), k)) { - v = pv; + if(this->equal(tbb::detail::invoke(*my_key, *p->get_value_ptr()), k)) { + v = p; return true; } } return false; } + // returns true and sets v to array element if found, else returns false. + bool find_ref_with_key(const Knoref& k, pointer_type &v) { + element_type* element_ptr = nullptr; + bool res = find_element_ref_with_key(k, element_ptr); + v = element_ptr->get_value_ptr(); + return res; + } + bool find_with_key( const Knoref& k, value_type &v) { value_type *p; if(find_ref_with_key(k, p)) { @@ -238,14 +320,14 @@ class hash_buffer : public HashCompare { void delete_with_key(const Knoref& k) { size_t h = this->hash(k) & mask(); element_type* prev = nullptr; - for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->second)) { - value_type *vp = reinterpret_cast(&(p->first)); + for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->get_next())) { + value_type *vp = p->get_value_ptr(); __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { - vp->~value_type(); - if(prev) prev->second = p->second; - else pointer_array[h] = (element_type *)(p->second); - p->second = free_list; + p->destroy_element(); + if(prev) prev->set_next(p->get_next()); + else pointer_array[h] = (element_type *)(p->get_next()); + p->set_next(free_list); free_list = p; --nelements; return; @@ -254,4 +336,45 @@ class hash_buffer : public HashCompare { __TBB_ASSERT(false, "key not 
found for delete"); } }; + +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator> + > +using hash_buffer = hash_buffer_impl, + ValueToKey, HashCompare, Allocator>; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template + < + typename Key, // type of key within ValueType + typename ValueType, + typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType + typename HashCompare, // has hash and equal + typename Allocator=tbb::cache_aligned_allocator> + > +struct metainfo_hash_buffer : public hash_buffer_impl, + ValueToKey, HashCompare, Allocator> +{ +private: + using base_type = hash_buffer_impl, + ValueToKey, HashCompare, Allocator>; +public: + bool find_with_key(const typename base_type::Knoref& k, + typename base_type::value_type& v, message_metainfo& metainfo) + { + typename base_type::element_type* p = nullptr; + bool result = this->find_element_ref_with_key(k, p); + if (result) { + v = *(p->get_value_ptr()); + metainfo = p->get_metainfo(); + } + return result; + } +}; +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT #endif // __TBB__flow_graph_hash_buffer_impl_H diff --git a/include/oneapi/tbb/detail/_flow_graph_trace_impl.h b/include/oneapi/tbb/detail/_flow_graph_trace_impl.h index a161dd0362..74ebf08456 100644 --- a/include/oneapi/tbb/detail/_flow_graph_trace_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_trace_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { template< typename T > class sender; template< typename T > class receiver; @@ -44,29 +44,29 @@ template< typename T > class receiver; static inline void fgt_alias_port(void *node, void *p, bool visible) { if(visible) - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); else - itt_relation_add( ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); } static inline void fgt_composite ( void* codeptr, void *node, void *graph ) { - itt_make_task_group( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); - itt_relation_add( ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); } static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); - itt_relation_add( 
ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); } template @@ -109,15 +109,15 @@ struct fgt_internal_output_alias_helper { }; static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) { - itt_make_task_group( ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); } static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) { - itt_make_task_group(ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); + itt_make_task_group(d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } @@ -167,40 +167,40 @@ struct fgt_internal_output_helper { template< typename NodeType > void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) ); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } template< typename NodeType > void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = const_cast(node); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, 
FLOW_OBJECT_NAME, desc ); } template< typename NodeType > static inline void fgt_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) ); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } static inline void fgt_graph_desc( const void *g, const char *desc ) { void *addr = const_cast< void *>(g); - itt_metadata_str_add( ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); + itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); } static inline void fgt_body( void *node, void *body ) { - itt_relation_add( ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); + itt_relation_add( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) { - itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port(codeptr, input_port, ports ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) { - itt_make_task_group( ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port( codeptr, input_port, ports ); fgt_body( input_port, body 
); @@ -208,28 +208,28 @@ static inline void fgt_multioutput_node_with_body( void* codeptr, string_resourc template< int N, typename PortsTuple > static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_internal_input_helper::register_port( output_port, ports ); } static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) { - itt_make_task_group( ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { - register_node_addr(ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); + register_node_addr(d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); } static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) { - itt_make_task_group( ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_body( output_port, body ); } @@ -251,47 +251,47 @@ static inline void fgt_node( void* codeptr, string_resource_index t, void *g, 
v } static inline void fgt_make_edge( void *output_port, void *input_port ) { - itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); + itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); } static inline void fgt_remove_edge( void *output_port, void *input_port ) { - itt_relation_add( ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); + itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); } static inline void fgt_graph( void *g ) { - itt_make_task_group( ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); + itt_make_task_group( d1::ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); } static inline void fgt_begin_body( void *body ) { - itt_task_begin( ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); + itt_task_begin( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); } static inline void fgt_end_body( void * ) { - itt_task_end( ITT_DOMAIN_FLOW ); + itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_try_put_begin( void *node, void *port ) { - itt_task_begin( ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); + itt_task_begin( d1::ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); } static inline void fgt_async_try_put_end( void *, void * ) { - itt_task_end( ITT_DOMAIN_FLOW ); + itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_reserve( void *node, void *graph ) { - itt_region_begin( ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); + itt_region_begin( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); } static inline void fgt_async_commit( void *node, void * /*graph*/) { - itt_region_end( 
ITT_DOMAIN_FLOW, node, FLOW_NODE ); + itt_region_end( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE ); } static inline void fgt_reserve_wait( void *graph ) { - itt_region_begin( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); + itt_region_begin( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); } static inline void fgt_release_wait( void *graph ) { - itt_region_end( ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); + itt_region_end( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); } #else // TBB_USE_PROFILING_TOOLS @@ -357,7 +357,7 @@ struct fgt_internal_output_alias_helper { #endif // TBB_USE_PROFILING_TOOLS -} // d1 +} // d2 } // namespace detail } // namespace tbb diff --git a/include/oneapi/tbb/detail/_flow_graph_types_impl.h b/include/oneapi/tbb/detail/_flow_graph_types_impl.h index 4827551d85..e361b23e7b 100644 --- a/include/oneapi/tbb/detail/_flow_graph_types_impl.h +++ b/include/oneapi/tbb/detail/_flow_graph_types_impl.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #error Do not #include this internal file directly; use public TBB headers instead. #endif -// included in namespace tbb::detail::d1 +// included in namespace tbb::detail::d2 // the change to key_matching (adding a K and KHash template parameter, making it a class) // means we have to pass this data to the key_matching_port. All the ports have only one @@ -73,40 +73,55 @@ struct make_sequence < 0, S... > { typedef sequence type; }; -//! 
type mimicking std::pair but with trailing fill to ensure each element of an array -//* will have the correct alignment -template -struct type_plus_align { - char first[sizeof(T1)]; - T2 second; - char fill1[REM]; +template struct alignment_of { + typedef struct { char t; U padded; } test_alignment; + static const size_t value = sizeof(test_alignment) - sizeof(U); }; -template -struct type_plus_align { - char first[sizeof(T1)]; - T2 second; +template +struct max_alignment_helper; + +template +struct max_alignment_helper { + using type = typename max_alignment_helper::type>::type; }; -template struct alignment_of { - typedef struct { char t; U padded; } test_alignment; - static const size_t value = sizeof(test_alignment) - sizeof(U); +template +struct max_alignment_helper { + using type = typename std::conditional::type; }; +template +using max_alignment_helper_t = typename max_alignment_helper::type; + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(push) +#pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier +#endif + // T1, T2 are actual types stored. The space defined for T1 in the type returned // is a char array of the correct size. Type T2 should be trivially-constructible, // T1 must be explicitly managed. -template -struct aligned_pair { - static const size_t t1_align = alignment_of::value; - static const size_t t2_align = alignment_of::value; - typedef type_plus_align just_pair; - static const size_t max_align = t1_align < t2_align ? t2_align : t1_align; - static const size_t extra_bytes = sizeof(just_pair) % max_align; - static const size_t remainder = extra_bytes ? 
max_align - extra_bytes : 0; -public: - typedef type_plus_align type; -}; // aligned_pair + +template +struct alignas(alignof(max_alignment_helper_t)) aligned_pair { + char first[sizeof(T1)]; + T2 second; +}; + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +template +struct alignas(alignof(max_alignment_helper_t)) aligned_triple { + char first[sizeof(T1)]; + T2 second; + T3 third; +}; +#endif + + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#pragma warning(pop) // warning 4324 is back +#endif // support for variant type // type we use when we're not storing a value diff --git a/include/oneapi/tbb/detail/_pipeline_filters.h b/include/oneapi/tbb/detail/_pipeline_filters.h index 46e7b95d6c..8121946729 100644 --- a/include/oneapi/tbb/detail/_pipeline_filters.h +++ b/include/oneapi/tbb/detail/_pipeline_filters.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,6 +32,12 @@ namespace d1 { class base_filter; } +namespace d2 { +template +__TBB_requires(std::copyable) +class input_node; +} + namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&); class pipeline; @@ -131,7 +137,7 @@ class flow_control { template friend class concrete_filter; template __TBB_requires(std::copyable) - friend class input_node; + friend class d2::input_node; public: void stop() { is_pipeline_stopped = true; } }; diff --git a/include/oneapi/tbb/detail/_segment_table.h b/include/oneapi/tbb/detail/_segment_table.h index 1a31d8a17d..7fbf1cc0f9 100644 --- a/include/oneapi/tbb/detail/_segment_table.h +++ b/include/oneapi/tbb/detail/_segment_table.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -260,34 +260,49 @@ class segment_table { } } + void destroy_and_deallocate_table(segment_table_type table, size_type num_segments) { + auto& alloc = get_allocator(); + for (size_type seg_idx = 0; seg_idx < num_segments; ++seg_idx) { + segment_table_allocator_traits::destroy(alloc, &table[seg_idx]); + } + segment_table_allocator_traits::deallocate(alloc, table, num_segments); + } + void clear_table() { segment_table_type current_segment_table = get_table(); if (current_segment_table != my_embedded_table) { // If the active table is not the embedded one - deallocate the active table - for (size_type i = 0; i != pointers_per_long_table; ++i) { - segment_table_allocator_traits::destroy(my_segment_table_allocator, ¤t_segment_table[i]); - } - - segment_table_allocator_traits::deallocate(my_segment_table_allocator, current_segment_table, pointers_per_long_table); + destroy_and_deallocate_table(current_segment_table, pointers_per_long_table); my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, 
pointers_per_embedded_table); } } void extend_table_if_necessary(segment_table_type& table, size_type start_index, size_type end_index) { - // extend_segment_table if an active table is an embedded table - // and the requested index is not in the embedded table + // Extend segment table if an active table is an embedded one and the requested index is + // outside it if (table == my_embedded_table && end_index > embedded_table_size) { if (start_index <= embedded_table_size) { + // More than one thread can get here: the one that has assigned the first block and + // is in the process of allocating it now, and the one that saw the first block has + // been assigned already, but not yet allocated. This latter thread decides not to + // wait for the first one and extend the table itself. try_call([&] { - table = self()->allocate_long_table(my_embedded_table, start_index); - // It is possible that the table was extended by the thread that allocated first_block. - // In this case it is necessary to re-read the current table. - - if (table) { - my_segment_table.store(table, std::memory_order_release); - } else { - table = my_segment_table.load(std::memory_order_acquire); + segment_table_type new_table = + self()->allocate_long_table(my_embedded_table, start_index); + // It is possible that the table was extended by the thread that allocated first + // block. In this case, the below CAS fails and re-reads the new table pointer. + if (my_segment_table.compare_exchange_strong( + table, new_table, + /*memory order in case of a success*/std::memory_order_release, + /*memory order in case of a failure*/std::memory_order_acquire)) + { + // CAS was successful, update the local table pointer with now actual + table = new_table; + } else if (new_table) { + // Other thread was the first to replace the segment table. Current thread's + // table is not needed anymore, so destroying it. 
+ destroy_and_deallocate_table(new_table, pointers_per_long_table); } }).on_exception([&] { my_segment_table_allocation_failed.store(true, std::memory_order_relaxed); diff --git a/include/oneapi/tbb/detail/_task.h b/include/oneapi/tbb/detail/_task.h index 1fa75281c8..400f9cd41c 100644 --- a/include/oneapi/tbb/detail/_task.h +++ b/include/oneapi/tbb/detail/_task.h @@ -44,6 +44,7 @@ class wait_context; class task_group_context; struct execution_data; class wait_tree_vertex_interface; +class task_arena_base; } namespace d2 { @@ -58,6 +59,7 @@ TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& c TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx); TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx); TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*); +TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base&); TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context(); TBB_EXPORT d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* wc); @@ -200,11 +202,9 @@ class reference_vertex : public wait_tree_vertex_interface { } void release(std::uint32_t delta = 1) override { + auto parent = my_parent; std::uint64_t ref = m_ref_count.fetch_sub(static_cast(delta)) - static_cast(delta); if (ref == 0) { - auto parent = my_parent; - execute_continuation(); - destroy(); parent->release(); } } @@ -212,12 +212,6 @@ class reference_vertex : public wait_tree_vertex_interface { std::uint32_t get_num_child() { return static_cast(m_ref_count.load(std::memory_order_acquire)); } - -protected: - virtual void execute_continuation() {} - virtual void destroy() {} - virtual void destroy(const d1::execution_data&) {} - private: wait_tree_vertex_interface* my_parent; std::atomic m_ref_count; diff --git 
a/include/oneapi/tbb/detail/_template_helpers.h b/include/oneapi/tbb/detail/_template_helpers.h index 50ce3d2d3b..a20c5af5c3 100644 --- a/include/oneapi/tbb/detail/_template_helpers.h +++ b/include/oneapi/tbb/detail/_template_helpers.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -106,6 +106,10 @@ using make_index_sequence = typename make_index_sequence_impl::type; #endif /* __TBB_CPP14_INTEGER_SEQUENCE_PRESENT */ +//! Attach an index to a type to use it with an index sequence +template +using indexed_t = T; + #if __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT using std::conjunction; using std::disjunction; diff --git a/include/oneapi/tbb/flow_graph.h b/include/oneapi/tbb/flow_graph.h index 2df4b14050..5b438faabf 100644 --- a/include/oneapi/tbb/flow_graph.h +++ b/include/oneapi/tbb/flow_graph.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -52,6 +52,7 @@ #include #include +#include #include #if __TBB_CPP20_CONCEPTS_PRESENT #include @@ -70,7 +71,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { //! An enumeration the provides the two most common concurrency levels: unlimited and serial enum concurrency { unlimited = 0, serial = 1 }; @@ -81,19 +82,19 @@ struct null_type {}; //! 
An empty class used for messages that mean "I'm done" class continue_msg {}; -} // namespace d1 +} // namespace d2 #if __TBB_CPP20_CONCEPTS_PRESENT namespace d0 { template -concept node_body_return_type = std::same_as || +concept node_body_return_type = std::same_as || std::convertible_to; // TODO: consider using std::invocable here template concept continue_node_body = std::copy_constructible && - requires( Body& body, const tbb::detail::d1::continue_msg& v ) { + requires( Body& body, const tbb::detail::d2::continue_msg& v ) { { body(v) } -> node_body_return_type; }; @@ -129,7 +130,7 @@ concept async_node_body = std::copy_constructible && } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT -namespace d1 { +namespace d2 { //! Forward declaration section template< typename T > class sender; @@ -153,7 +154,7 @@ template struct node_set; #endif -} // namespace d1 +} // namespace d2 } // namespace detail } // namespace tbb @@ -162,7 +163,7 @@ template struct node_set; namespace tbb { namespace detail { -namespace d1 { +namespace d2 { static inline std::pair order_tasks(graph_task* first, graph_task* second) { if (second->priority > first->priority) @@ -187,6 +188,37 @@ static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* return left; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +class message_metainfo { +public: + using waiters_type = std::forward_list; + + message_metainfo() = default; + + message_metainfo(const waiters_type& waiters) : my_waiters(waiters) {} + message_metainfo(waiters_type&& waiters) : my_waiters(std::move(waiters)) {} + + const waiters_type& waiters() const & { return my_waiters; } + waiters_type&& waiters() && { return std::move(my_waiters); } + + bool empty() const { return my_waiters.empty(); } + + void merge(const message_metainfo& other) { + // TODO: should we avoid duplications on merging + my_waiters.insert_after(my_waiters.before_begin(), + other.waiters().begin(), + other.waiters().end()); + } +private: + 
waiters_type my_waiters; +}; // class message_metainfo + +#define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) , metainfo + +#else +#define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Pure virtual template class that defines a sender of messages of type T template< typename T > class sender { @@ -196,9 +228,17 @@ class sender { //! Request an item from the sender virtual bool try_get( T & ) { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual bool try_get( T &, message_metainfo& ) { return false; } +#endif + //! Reserves an item in the sender virtual bool try_reserve( T & ) { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual bool try_reserve( T &, message_metainfo& ) { return false; } +#endif + //! Releases the reserved item virtual bool try_release( ) { return false; } @@ -238,17 +278,38 @@ bool remove_successor(sender& s, receiver& r) { //! Pure virtual template class that defines a receiver of messages of type T template< typename T > class receiver { +private: + template + bool internal_try_put(const T& t, TryPutTaskArgs&&... args) { + graph_task* res = try_put_task(t, std::forward(args)...); + if (!res) return false; + if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); + return true; + } + public: //! Destructor virtual ~receiver() {} //! Put an item to the receiver bool try_put( const T& t ) { - graph_task *res = try_put_task(t); - if (!res) return false; - if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); - return true; + return internal_try_put(t); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! 
Put an item to the receiver and wait for completion + bool try_put_and_wait( const T& t ) { + // Since try_put_and_wait is a blocking call, it is safe to create wait_context on stack + d1::wait_context_vertex msg_wait_vertex{}; + + bool res = internal_try_put(t, message_metainfo{message_metainfo::waiters_type{&msg_wait_vertex}}); + if (res) { + __TBB_ASSERT(graph_reference().my_context != nullptr, "No wait_context associated with the Flow Graph"); + d1::wait(msg_wait_vertex.get_context(), *graph_reference().my_context); + } + return res; } +#endif //! put item to successor; return task to run the successor if possible. protected: @@ -262,6 +323,9 @@ class receiver { template< typename X, typename Y > friend class broadcast_cache; template< typename X, typename Y > friend class round_robin_cache; virtual graph_task *try_put_task(const T& t) = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task *try_put_task(const T& t, const message_metainfo&) = 0; +#endif virtual graph& graph_reference() const = 0; template friend class successor_cache; @@ -337,23 +401,61 @@ class continue_receiver : public receiver< continue_msg > { template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; + +private: // execute body is supposed to be too small to create a task for. 
- graph_task* try_put_task( const input_type & ) override { + graph_task* try_put_task_impl( const input_type& __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo predecessor_metainfo; +#endif { spin_mutex::scoped_lock l(my_mutex); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Prolong the wait and store the metainfo until receiving signals from all the predecessors + for (auto waiter : metainfo.waiters()) { + waiter->reserve(1); + } + my_current_metainfo.merge(metainfo); +#endif if ( ++my_current_count < my_predecessor_count ) return SUCCESSFULLY_ENQUEUED; - else + else { my_current_count = 0; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + predecessor_metainfo = my_current_metainfo; + my_current_metainfo = message_metainfo{}; +#endif + } + } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* res = execute(predecessor_metainfo); + for (auto waiter : predecessor_metainfo.waiters()) { + waiter->release(1); } +#else graph_task* res = execute(); +#endif return res? res : SUCCESSFULLY_ENQUEUED; } +protected: + graph_task* try_put_task( const input_type& input ) override { + return try_put_task_impl(input __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const input_type& input, const message_metainfo& metainfo ) override { + return try_put_task_impl(input, metainfo); + } +#endif + spin_mutex my_mutex; int my_predecessor_count; int my_current_count; int my_initial_predecessor_count; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo my_current_metainfo; +#endif node_priority_t my_priority; // the friend declaration in the base class did not eliminate the "protected class" // error in gcc 4.1.2 @@ -369,7 +471,11 @@ class continue_receiver : public receiver< continue_msg > { //! 
Does whatever should happen when the threshold is reached /** This should be very fast or else spawn a task. This is called while the sender is blocked in the try_put(). */ +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + virtual graph_task* execute(const message_metainfo& metainfo) = 0; +#else virtual graph_task* execute() = 0; +#endif template friend class successor_cache; bool is_continue_receiver() override { return true; } @@ -392,7 +498,7 @@ class continue_receiver : public receiver< continue_msg > { namespace tbb { namespace detail { -namespace d1 { +namespace d2 { #include "detail/_flow_graph_body_impl.h" #include "detail/_flow_graph_cache_impl.h" @@ -424,7 +530,7 @@ void graph_iterator::internal_forward() { } //! Constructs a graph with isolated task_group_context -inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { +inline graph::graph() : my_wait_context_vertex(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = true; cancelled = false; @@ -435,7 +541,7 @@ inline graph::graph() : my_wait_context(0), my_nodes(nullptr), my_nodes_last(nul } inline graph::graph(task_group_context& use_this_context) : - my_wait_context(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { + my_wait_context_vertex(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = false; cancelled = false; @@ -454,13 +560,13 @@ inline graph::~graph() { } inline void graph::reserve_wait() { - my_wait_context.reserve(); + my_wait_context_vertex.reserve(); fgt_reserve_wait(this); } inline void graph::release_wait() { fgt_release_wait(this); - my_wait_context.release(); + my_wait_context_vertex.release(); } inline void graph::register_node(graph_node *n) { @@ -633,6 +739,18 @@ class input_node : public graph_node, public sender< Output > { } } +#if 
__TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +private: + bool try_reserve( output_type& v, message_metainfo& ) override { + return try_reserve(v); + } + + bool try_get( output_type& v, message_metainfo& ) override { + return try_get(v); + } +public: +#endif + //! Release a reserved item. /** true = item has been released and so remains in sender, dest must request or reserve future items */ bool try_release( ) override { @@ -703,7 +821,7 @@ class input_node : public graph_node, public sender< Output > { return false; } if ( !my_has_cached_item ) { - flow_control control; + d1::flow_control control; fgt_begin_body( my_body ); @@ -722,10 +840,9 @@ class input_node : public graph_node, public sender< Output > { } graph_task* create_put_task() { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef input_node_task_bypass< input_node > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); return t; } @@ -962,6 +1079,14 @@ class split_node : public graph_node, public receiver { // Also, we do not have successors here. So we just tell the task returned here is successful. return emit_element::emit_this(this->my_graph, t, output_ports()); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const TupleType& t, const message_metainfo& metainfo) override { + // Sending split messages in parallel is not justified, as overheads would prevail. + // Also, we do not have successors here. So we just tell the task returned here is successful. 
+ return emit_element::emit_this(this->my_graph, t, output_ports(), metainfo); + } +#endif + void reset_node(reset_flags f) override { if (f & rf_clear_edges) clear_element::clear_this(my_output_ports); @@ -1119,17 +1244,28 @@ class broadcast_node : public graph_node, public receiver, public sender { return true; } +private: + graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + graph_task* new_task = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); + if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; + return new_task; + } + protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; //! build a task to run the successor if possible. Default is old behavior. - graph_task *try_put_task(const T& t) override { - graph_task *new_task = my_successors.try_put_task(t); - if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; - return new_task; + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } @@ -1168,24 +1304,37 @@ class buffer_node }; // implements the aggregator_operation concept - class buffer_operation : public aggregated_operation< buffer_operation > { + class buffer_operation : public d1::aggregated_operation< buffer_operation > { public: char type; T* elem; graph_task* ltask; successor_type *r; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo* metainfo{ nullptr }; +#endif buffer_operation(const T& e, op_type t) : type(char(t)) , elem(const_cast(&e)) , ltask(nullptr) , r(nullptr) {} + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + buffer_operation(const T& 
e, op_type t, const message_metainfo& info) + : type(char(t)), elem(const_cast(&e)), ltask(nullptr), r(nullptr) + , metainfo(const_cast(&info)) + {} + + buffer_operation(op_type t, message_metainfo& info) + : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr), metainfo(&info) {} +#endif buffer_operation(op_type t) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr) {} }; bool forwarder_busy; - typedef aggregating_functor handler_type; - friend class aggregating_functor; - aggregator< handler_type, buffer_operation> my_aggregator; + typedef d1::aggregating_functor handler_type; + friend class d1::aggregating_functor; + d1::aggregator< handler_type, buffer_operation> my_aggregator; virtual void handle_operations(buffer_operation *op_list) { handle_operations_impl(op_list, this); @@ -1218,9 +1367,8 @@ class buffer_node if(is_graph_active(this->my_graph)) { forwarder_busy = true; typedef forward_task_bypass task_type; - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; graph_task* new_task = allocator.new_object(graph_reference(), allocator, *this); - my_graph.reserve_wait(); // tmp should point to the last item handled by the aggregator. This is the operation // the handling thread enqueued. So modifying that record will be okay. 
// TODO revamp: check that the issue is still present @@ -1286,7 +1434,8 @@ class buffer_node } void try_put_and_add_task(graph_task*& last_task) { - graph_task *new_task = my_successors.try_put_task(this->back()); + graph_task* new_task = my_successors.try_put_task(this->back() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->back_metainfo())); if (new_task) { // workaround for icc bug graph& g = this->my_graph; @@ -1328,14 +1477,25 @@ class buffer_node virtual bool internal_push(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + this->push_back(*(op->elem), (*op->metainfo)); +#else this->push_back(*(op->elem)); +#endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } virtual void internal_pop(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); - if(this->pop_back(*(op->elem))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool pop_result = op->metainfo ? this->pop_back(*(op->elem), *(op->metainfo)) + : this->pop_back(*(op->elem)); +#else + bool pop_result = this->pop_back(*(op->elem)); +#endif + if (pop_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -1345,7 +1505,13 @@ class buffer_node virtual void internal_reserve(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); - if(this->reserve_front(*(op->elem))) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool reserve_result = op->metainfo ? this->reserve_front(*(op->elem), *(op->metainfo)) + : this->reserve_front(*(op->elem)); +#else + bool reserve_result = this->reserve_front(*(op->elem)); +#endif + if (reserve_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { @@ -1403,7 +1569,7 @@ class buffer_node It also calls r.remove_predecessor(*this) to remove this node as a predecessor. 
*/ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why full qualification is necessary here - tbb::detail::d1::remove_predecessor(r, *this); + tbb::detail::d2::remove_predecessor(r, *this); buffer_operation op_data(rem_succ); op_data.r = &r; my_aggregator.execute(&op_data); @@ -1425,6 +1591,16 @@ class buffer_node return (op_data.status==SUCCEEDED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( T &v, message_metainfo& metainfo ) override { + buffer_operation op_data(req_item, metainfo); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return (op_data.status==SUCCEEDED); + } +#endif + //! Reserves an item. /** false = no item can be reserved
true = an item is reserved */ @@ -1436,6 +1612,16 @@ class buffer_node return (op_data.status==SUCCEEDED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_reserve( output_type& v, message_metainfo& metainfo ) override { + buffer_operation op_data(res_item, metainfo); + op_data.elem = &v; + my_aggregator.execute(&op_data); + (void)enqueue_forwarding_task(op_data); + return op_data.status==SUCCEEDED; + } +#endif + //! Release a reserved item. /** true = item has been released and so remains in sender */ bool try_release() override { @@ -1454,14 +1640,9 @@ class buffer_node return true; } -protected: - - template< typename R, typename B > friend class run_and_put_task; - template friend class broadcast_cache; - template friend class round_robin_cache; - //! receive an item, return a task *if possible - graph_task *try_put_task(const T &t) override { - buffer_operation op_data(t, put_item); +private: + graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { + buffer_operation op_data(t, put_item __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); graph_task *ft = grab_forwarding_task(op_data); // sequencer_nodes can return failure (if an item has been previously inserted) @@ -1479,6 +1660,22 @@ class buffer_node return ft; } +protected: + + template< typename R, typename B > friend class run_and_put_task; + template friend class broadcast_cache; + template friend class round_robin_cache; + //! 
receive an item, return a task *if possible + graph_task *try_put_task(const T &t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } @@ -1511,7 +1708,9 @@ class queue_node : public buffer_node { } void try_put_and_add_task(graph_task*& last_task) { - graph_task *new_task = this->my_successors.try_put_task(this->front()); + graph_task* new_task = this->my_successors.try_put_task(this->front() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->front_metainfo())); + if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); @@ -1530,7 +1729,14 @@ class queue_node : public buffer_node { op->status.store(FAILED, std::memory_order_release); } else { - this->pop_front(*(op->elem)); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + this->pop_front(*(op->elem), *(op->metainfo)); + } else +#endif + { + this->pop_front(*(op->elem)); + } op->status.store(SUCCEEDED, std::memory_order_release); } } @@ -1539,7 +1745,15 @@ class queue_node : public buffer_node { op->status.store(FAILED, std::memory_order_release); } else { - this->reserve_front(*(op->elem)); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + this->reserve_front(*(op->elem), *(op->metainfo)); + } + else +#endif + { + this->reserve_front(*(op->elem)); + } op->status.store(SUCCEEDED, std::memory_order_release); } } @@ -1647,7 +1861,13 @@ class sequencer_node : public queue_node { } this->my_tail = new_tail; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + bool place_item_result = this->place_item(tag, *(op->elem), *(op->metainfo)); + const op_stat res = place_item_result ? 
SUCCEEDED : FAILED; +#else const op_stat res = this->place_item(tag, *(op->elem)) ? SUCCEEDED : FAILED; +#endif op->status.store(res, std::memory_order_release); return res ==SUCCEEDED; } @@ -1710,7 +1930,12 @@ class priority_queue_node : public buffer_node { } bool internal_push(prio_operation *op) override { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + __TBB_ASSERT(op->metainfo, nullptr); + prio_push(*(op->elem), *(op->metainfo)); +#else prio_push(*(op->elem)); +#endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } @@ -1723,6 +1948,11 @@ class priority_queue_node : public buffer_node { } *(op->elem) = prio(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + *(op->metainfo) = std::move(prio_metainfo()); + } +#endif op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); @@ -1736,6 +1966,12 @@ class priority_queue_node : public buffer_node { } this->my_reserved = true; *(op->elem) = prio(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + if (op->metainfo) { + *(op->metainfo) = std::move(prio_metainfo()); + reserved_metainfo = *(op->metainfo); + } +#endif reserved_item = *(op->elem); op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); @@ -1745,13 +1981,27 @@ class priority_queue_node : public buffer_node { op->status.store(SUCCEEDED, std::memory_order_release); this->my_reserved = false; reserved_item = input_type(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto waiter : reserved_metainfo.waiters()) { + waiter->release(1); + } + + reserved_metainfo = message_metainfo{}; +#endif } void internal_release(prio_operation *op) override { op->status.store(SUCCEEDED, std::memory_order_release); - prio_push(reserved_item); + prio_push(reserved_item __TBB_FLOW_GRAPH_METAINFO_ARG(reserved_metainfo)); this->my_reserved = false; reserved_item = input_type(); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto waiter : reserved_metainfo.waiters()) { + waiter->release(1); + } + + 
reserved_metainfo = message_metainfo{}; +#endif } private: @@ -1767,7 +2017,8 @@ class priority_queue_node : public buffer_node { } void try_put_and_add_task(graph_task*& last_task) { - graph_task * new_task = this->my_successors.try_put_task(this->prio()); + graph_task* new_task = this->my_successors.try_put_task(this->prio() + __TBB_FLOW_GRAPH_METAINFO_ARG(this->prio_metainfo())); if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); @@ -1781,6 +2032,9 @@ class priority_queue_node : public buffer_node { size_type mark; input_type reserved_item; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo reserved_metainfo; +#endif // in case a reheap has not been done after a push, check if the mark item is higher than the 0'th item bool prio_use_tail() { @@ -1789,10 +2043,10 @@ class priority_queue_node : public buffer_node { } // prio_push: checks that the item will fit, expand array if necessary, put at end - void prio_push(const T &src) { + void prio_push(const T &src __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if ( this->my_tail >= this->my_array_size ) this->grow_my_array( this->my_tail + 1 ); - (void) this->place_item(this->my_tail, src); + (void) this->place_item(this->my_tail, src __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++(this->my_tail); __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push"); } @@ -1826,6 +2080,12 @@ class priority_queue_node : public buffer_node { return this->get_my_item(prio_use_tail() ? this->my_tail-1 : 0); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo& prio_metainfo() { + return this->get_my_metainfo(prio_use_tail() ? 
this->my_tail-1 : 0); + } +#endif + // turn array into heap void heapify() { if(this->my_tail == 0) { @@ -1836,7 +2096,10 @@ class priority_queue_node : public buffer_node { for (; markmy_tail; ++mark) { // for each unheaped element size_type cur_pos = mark; input_type to_place; - this->fetch_item(mark,to_place); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + this->fetch_item(mark, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); do { // push to_place up the heap size_type parent = (cur_pos-1)>>1; if (!compare(this->get_my_item(parent), to_place)) @@ -1844,7 +2107,7 @@ class priority_queue_node : public buffer_node { this->move_item(cur_pos, parent); cur_pos = parent; } while( cur_pos ); - (void) this->place_item(cur_pos, to_place); + this->place_item(cur_pos, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } @@ -1944,9 +2207,12 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //SUCCESS // if we can reserve and can put, we consume the reservation // we increment the count and decrement the tries - if ( (my_predecessors.try_reserve(v)) == true ) { +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo metainfo; +#endif + if ( (my_predecessors.try_reserve(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) == true ) { reserved = true; - if ( (rval = my_successors.try_put_task(v)) != nullptr ) { + if ( (rval = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) != nullptr ) { { spin_mutex::scoped_lock lock(my_mutex); ++my_count; @@ -1965,9 +2231,8 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { typedef forward_task_bypass> task_type; - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; graph_task* rtask = allocator.new_object( my_graph, allocator, *this ); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), 
*rtask); } } @@ -1984,10 +2249,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > if (reserved) my_predecessors.try_release(); if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); __TBB_ASSERT(!rval, "Have two tasks to handle"); return t; } @@ -2035,10 +2299,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //spawn a forward task if this is the only successor if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) { if ( is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *t); } } @@ -2049,7 +2312,7 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > /** r.remove_predecessor(*this) is also called. 
*/ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why qualification is needed for remove_predecessor() call - tbb::detail::d1::remove_predecessor(r, *this); + tbb::detail::d2::remove_predecessor(r, *this); my_successors.remove_successor(r); return true; } @@ -2059,10 +2322,9 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > spin_mutex::scoped_lock lock(my_mutex); my_predecessors.add( src ); if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); spawn_in_graph_arena(graph_reference(), *t); } return true; @@ -2079,8 +2341,10 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; + +private: //! Puts an item to this receiver - graph_task* try_put_task( const T &t ) override { + graph_task* try_put_task_impl( const T &t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { { spin_mutex::scoped_lock lock(my_mutex); if ( my_count + my_tries >= my_threshold ) @@ -2089,15 +2353,14 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > ++my_tries; } - graph_task* rtask = my_successors.try_put_task(t); + graph_task* rtask = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( !rtask ) { // try_put_task failed. 
spin_mutex::scoped_lock lock(my_mutex); --my_tries; if (check_conditions() && is_graph_active(this->my_graph)) { - small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; rtask = allocator.new_object(my_graph, allocator, *this); - my_graph.reserve_wait(); } } else { @@ -2118,6 +2381,16 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > return rtask; } +protected: + graph_task* try_put_task(const T& t) override { + return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { + return try_put_task_impl(t, metainfo); + } +#endif + graph& graph_reference() const override { return my_graph; } void reset_node( reset_flags f ) override { @@ -3054,10 +3327,9 @@ class overwrite_node : public graph_node, public receiver, public sender { // because failed reserve does not mean that register_successor is not ready to put a message immediately. // We have some sort of infinite loop: reserving node tries to set pull state for the edge, // but overwrite_node tries to return push state back. That is why we have to break this loop with task creation. 
- small_object_allocator allocator{}; + d1::small_object_allocator allocator{}; typedef register_predecessor_task task_type; graph_task* t = allocator.new_object(graph_reference(), allocator, *this, s); - graph_reference().reserve_wait(); spawn_in_graph_arena( my_graph, *t ); } } else { @@ -3082,11 +3354,45 @@ class overwrite_node : public graph_node, public receiver, public sender { return false; } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( input_type &v, message_metainfo& metainfo ) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid) { + v = my_buffer; + metainfo = my_buffered_metainfo; + + // Since the successor of the node will use move semantics while wrapping the metainfo + // that is designed to transfer the ownership of the value from single-push buffer to the task + // It is required to reserve one more reference here because the value keeps in the buffer + // and the ownership is not transferred + for (auto msg_waiter : metainfo.waiters()) { + msg_waiter->reserve(1); + } + return true; + } + return false; + } +#endif + //! Reserves an item bool try_reserve( T &v ) override { return try_get(v); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +private: + bool try_reserve(T& v, message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( my_mutex ); + if (my_buffer_is_valid) { + v = my_buffer; + metainfo = my_buffered_metainfo; + return true; + } + return false; + } +public: +#endif + //! 
Releases the reserved item bool try_release() override { return true; } @@ -3101,6 +3407,12 @@ class overwrite_node : public graph_node, public receiver, public sender { void clear() { spin_mutex::scoped_lock l( my_mutex ); my_buffer_is_valid = false; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + for (auto msg_waiter : my_buffered_metainfo.waiters()) { + msg_waiter->release(1); + } + my_buffered_metainfo = message_metainfo{}; +#endif } protected: @@ -3110,13 +3422,33 @@ class overwrite_node : public graph_node, public receiver, public sender { template friend class round_robin_cache; graph_task* try_put_task( const input_type &v ) override { spin_mutex::scoped_lock l( my_mutex ); - return try_put_task_impl(v); + return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const input_type& v, const message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( my_mutex ); + return try_put_task_impl(v, metainfo); } +#endif - graph_task * try_put_task_impl(const input_type &v) { + graph_task * try_put_task_impl(const input_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { my_buffer = v; my_buffer_is_valid = true; - graph_task* rtask = my_successors.try_put_task(v); +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + // Since the new item is pushed to the buffer - reserving the waiters + for (auto msg_waiter : metainfo.waiters()) { + msg_waiter->reserve(1); + } + + // Since the item is taken out from the buffer - releasing the stored waiters + for (auto msg_waiter : my_buffered_metainfo.waiters()) { + msg_waiter->release(1); + } + + my_buffered_metainfo = metainfo; +#endif + graph_task* rtask = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(my_buffered_metainfo) ); if (!rtask) rtask = SUCCESSFULLY_ENQUEUED; return rtask; } @@ -3128,13 +3460,13 @@ class overwrite_node : public graph_node, public receiver, public sender { //! 
Breaks an infinite loop between the node reservation and register_successor call struct register_predecessor_task : public graph_task { register_predecessor_task( - graph& g, small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) + graph& g, d1::small_object_allocator& allocator, predecessor_type& owner, successor_type& succ) : graph_task(g, allocator), o(owner), s(succ) {}; - task* execute(execution_data& ed) override { + d1::task* execute(d1::execution_data& ed) override { // TODO revamp: investigate why qualification is needed for register_successor() call - using tbb::detail::d1::register_predecessor; - using tbb::detail::d1::register_successor; + using tbb::detail::d2::register_predecessor; + using tbb::detail::d2::register_successor; if ( !register_predecessor(s, o) ) { register_successor(o, s); } @@ -3142,7 +3474,7 @@ class overwrite_node : public graph_node, public receiver, public sender { return nullptr; } - task* cancel(execution_data& ed) override { + d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } @@ -3154,6 +3486,9 @@ class overwrite_node : public graph_node, public receiver, public sender { spin_mutex my_mutex; broadcast_cache< input_type, null_rw_mutex > my_successors; input_type my_buffer; +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + message_metainfo my_buffered_metainfo; +#endif bool my_buffer_is_valid; void reset_node( reset_flags f) override { @@ -3200,8 +3535,15 @@ class write_once_node : public overwrite_node { template friend class round_robin_cache; graph_task *try_put_task( const T &v ) override { spin_mutex::scoped_lock l( this->my_mutex ); - return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v); + return this->my_buffer_is_valid ? 
nullptr : this->try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { + spin_mutex::scoped_lock l( this->my_mutex ); + return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v, metainfo); + } +#endif }; // write_once_node inline void set_name(const graph& g, const char *name) { @@ -3293,7 +3635,7 @@ inline void set_name(const async_node& node, const char * { fgt_multioutput_node_desc(&node, name); } -} // d1 +} // d2 } // detail } // tbb @@ -3304,56 +3646,56 @@ inline void set_name(const async_node& node, const char * namespace tbb { namespace flow { inline namespace v1 { - using detail::d1::receiver; - using detail::d1::sender; - - using detail::d1::serial; - using detail::d1::unlimited; - - using detail::d1::reset_flags; - using detail::d1::rf_reset_protocol; - using detail::d1::rf_reset_bodies; - using detail::d1::rf_clear_edges; - - using detail::d1::graph; - using detail::d1::graph_node; - using detail::d1::continue_msg; - - using detail::d1::input_node; - using detail::d1::function_node; - using detail::d1::multifunction_node; - using detail::d1::split_node; - using detail::d1::output_port; - using detail::d1::indexer_node; - using detail::d1::tagged_msg; - using detail::d1::cast_to; - using detail::d1::is_a; - using detail::d1::continue_node; - using detail::d1::overwrite_node; - using detail::d1::write_once_node; - using detail::d1::broadcast_node; - using detail::d1::buffer_node; - using detail::d1::queue_node; - using detail::d1::sequencer_node; - using detail::d1::priority_queue_node; - using detail::d1::limiter_node; - using namespace detail::d1::graph_policy_namespace; - using detail::d1::join_node; - using detail::d1::input_port; - using detail::d1::copy_body; - using detail::d1::make_edge; - using detail::d1::remove_edge; - using detail::d1::tag_value; - using detail::d1::composite_node; 
- using detail::d1::async_node; - using detail::d1::node_priority_t; - using detail::d1::no_priority; + using detail::d2::receiver; + using detail::d2::sender; + + using detail::d2::serial; + using detail::d2::unlimited; + + using detail::d2::reset_flags; + using detail::d2::rf_reset_protocol; + using detail::d2::rf_reset_bodies; + using detail::d2::rf_clear_edges; + + using detail::d2::graph; + using detail::d2::graph_node; + using detail::d2::continue_msg; + + using detail::d2::input_node; + using detail::d2::function_node; + using detail::d2::multifunction_node; + using detail::d2::split_node; + using detail::d2::output_port; + using detail::d2::indexer_node; + using detail::d2::tagged_msg; + using detail::d2::cast_to; + using detail::d2::is_a; + using detail::d2::continue_node; + using detail::d2::overwrite_node; + using detail::d2::write_once_node; + using detail::d2::broadcast_node; + using detail::d2::buffer_node; + using detail::d2::queue_node; + using detail::d2::sequencer_node; + using detail::d2::priority_queue_node; + using detail::d2::limiter_node; + using namespace detail::d2::graph_policy_namespace; + using detail::d2::join_node; + using detail::d2::input_port; + using detail::d2::copy_body; + using detail::d2::make_edge; + using detail::d2::remove_edge; + using detail::d2::tag_value; + using detail::d2::composite_node; + using detail::d2::async_node; + using detail::d2::node_priority_t; + using detail::d2::no_priority; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET - using detail::d1::follows; - using detail::d1::precedes; - using detail::d1::make_node_set; - using detail::d1::make_edges; + using detail::d2::follows; + using detail::d2::precedes; + using detail::d2::make_node_set; + using detail::d2::make_edges; #endif } // v1 @@ -3362,7 +3704,7 @@ inline namespace v1 { using detail::d1::flow_control; namespace profiling { - using detail::d1::set_name; + using detail::d2::set_name; } // profiling } // tbb diff --git 
a/include/oneapi/tbb/flow_graph_abstractions.h b/include/oneapi/tbb/flow_graph_abstractions.h index 121f167c4d..329e75c43e 100644 --- a/include/oneapi/tbb/flow_graph_abstractions.h +++ b/include/oneapi/tbb/flow_graph_abstractions.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ namespace tbb { namespace detail { -namespace d1 { +namespace d2 { //! Pure virtual template classes that define interfaces for async communication class graph_proxy { @@ -43,7 +43,7 @@ class receiver_gateway : public graph_proxy { virtual bool try_put(const input_type&) = 0; }; -} // d1 +} // d2 } // detail diff --git a/include/oneapi/tbb/parallel_for_each.h b/include/oneapi/tbb/parallel_for_each.h index ab0b345388..85c0269196 100644 --- a/include/oneapi/tbb/parallel_for_each.h +++ b/include/oneapi/tbb/parallel_for_each.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -118,14 +118,17 @@ struct feeder_item_task: public task { using feeder_type = feeder_impl; template - feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc) : + feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc, wait_tree_vertex_interface& wait_vertex) : item(std::forward(input_item)), my_feeder(feeder), - my_allocator(alloc) - {} + my_allocator(alloc), + m_wait_tree_vertex(r1::get_thread_reference_vertex(&wait_vertex)) + { + m_wait_tree_vertex->reserve(); + } void finalize(const execution_data& ed) { - my_feeder.my_wait_context.release(); + m_wait_tree_vertex->release(); my_allocator.delete_object(this, ed); } @@ -160,6 +163,7 @@ struct feeder_item_task: public task { Item item; feeder_type& my_feeder; small_object_allocator my_allocator; + wait_tree_vertex_interface* m_wait_tree_vertex; }; // class feeder_item_task /** Implements new task adding procedure. @@ -170,9 +174,8 @@ class feeder_impl : public feeder { void internal_add_copy_impl(std::true_type, const Item& item) { using feeder_task = feeder_item_task; small_object_allocator alloc; - auto task = alloc.new_object(item, *this, alloc); + auto task = alloc.new_object(item, *this, alloc, my_wait_context); - my_wait_context.reserve(); spawn(*task, my_execution_context); } @@ -187,20 +190,19 @@ class feeder_impl : public feeder { void internal_add_move(Item&& item) override { using feeder_task = feeder_item_task; small_object_allocator alloc{}; - auto task = alloc.new_object(std::move(item), *this, alloc); + auto task = alloc.new_object(std::move(item), *this, alloc, my_wait_context); - my_wait_context.reserve(); spawn(*task, my_execution_context); } public: - feeder_impl(const Body& body, wait_context& w_context, task_group_context &context) + feeder_impl(const Body& body, wait_context_vertex& w_context, task_group_context &context) : my_body(body), my_wait_context(w_context) , my_execution_context(context) {} const Body& my_body; 
- wait_context& my_wait_context; + wait_context_vertex& my_wait_context; task_group_context& my_execution_context; }; // class feeder_impl @@ -263,7 +265,7 @@ struct input_block_handling_task : public task { using iteration_task_iterator_type = typename input_iteration_task_iterator_helper::type; using iteration_task = for_each_iteration_task; - input_block_handling_task(wait_context& root_wait_context, task_group_context& e_context, + input_block_handling_task(wait_context_vertex& root_wait_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context), my_execution_context(e_context), my_allocator(alloc) @@ -312,7 +314,7 @@ struct input_block_handling_task : public task { aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; - wait_context& my_root_wait_context; + wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class input_block_handling_task @@ -326,7 +328,7 @@ struct forward_block_handling_task : public task { using iteration_task = for_each_iteration_task; forward_block_handling_task(Iterator first, std::size_t size, - wait_context& w_context, task_group_context& e_context, + wait_context_vertex& w_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) : my_size(size), my_wait_context(0), my_root_wait_context(w_context), @@ -373,7 +375,7 @@ struct forward_block_handling_task : public task { aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; - wait_context& my_root_wait_context; + wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class forward_block_handling_task @@ -456,7 +458,7 @@ using feeder_is_required = tbb::detail::void_t struct feeder_holder { - feeder_holder( 
wait_context&, task_group_context&, const Body& ) {} + feeder_holder( wait_context_vertex&, task_group_context&, const Body& ) {} feeder_impl* feeder_ptr() { return nullptr; } }; // class feeder_holder @@ -464,7 +466,7 @@ struct feeder_holder { template class feeder_holder> { public: - feeder_holder( wait_context& w_context, task_group_context& context, const Body& body ) + feeder_holder( wait_context_vertex& w_context, task_group_context& context, const Body& body ) : my_feeder(body, w_context, context) {} feeder_impl* feeder_ptr() { return &my_feeder; } @@ -475,7 +477,7 @@ class feeder_holder class for_each_root_task_base : public task { public: - for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context& w_context, task_group_context& e_context) + for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context_vertex& w_context, task_group_context& e_context) : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context), my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body) { @@ -489,7 +491,7 @@ class for_each_root_task_base : public task { protected: Iterator my_first; Iterator my_last; - wait_context& my_wait_context; + wait_context_vertex& my_wait_context; task_group_context& my_execution_context; const Body& my_body; feeder_holder my_feeder_holder; @@ -624,11 +626,11 @@ void run_parallel_for_each( Iterator first, Iterator last, const Body& body, tas { if (!(first == last)) { using ItemType = get_item_type::value_type>; - wait_context w_context(0); + wait_context_vertex w_context(0); for_each_root_task root_task(first, last, body, w_context, context); - execute_and_wait(root_task, context, w_context, context); + execute_and_wait(root_task, context, w_context.get_context(), context); } } diff --git a/include/oneapi/tbb/version.h b/include/oneapi/tbb/version.h index fff3e7e2f9..c8f3ad50e3 100644 --- a/include/oneapi/tbb/version.h +++ 
b/include/oneapi/tbb/version.h @@ -27,9 +27,9 @@ #endif // Product version -#define TBB_VERSION_MAJOR 2021 +#define TBB_VERSION_MAJOR 2022 // Update version -#define TBB_VERSION_MINOR 13 +#define TBB_VERSION_MINOR 0 // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string @@ -44,7 +44,7 @@ // OneAPI oneTBB specification version #define ONETBB_SPEC_VERSION "1.0" // Full interface version -#define TBB_INTERFACE_VERSION 12130 +#define TBB_INTERFACE_VERSION 12140 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version diff --git a/include/tbb/blocked_rangeNd.h b/include/tbb/blocked_nd_range.h similarity index 86% rename from include/tbb/blocked_rangeNd.h rename to include/tbb/blocked_nd_range.h index 0c0fb7303a..70ca73af4b 100644 --- a/include/tbb/blocked_rangeNd.h +++ b/include/tbb/blocked_nd_range.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2017-2021 Intel Corporation + Copyright (c) 2017-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,4 +14,4 @@ limitations under the License. */ -#include "../oneapi/tbb/blocked_rangeNd.h" +#include "../oneapi/tbb/blocked_nd_range.h" diff --git a/integration/linux/modulefiles/tbb b/integration/linux/modulefiles/tbb index b8c695ed2c..58113ee62c 100644 --- a/integration/linux/modulefiles/tbb +++ b/integration/linux/modulefiles/tbb @@ -1,6 +1,6 @@ #%Module1.0################################################################### # -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -50,7 +50,7 @@ module-whatis "Dependencies: none" proc ModulesHelp { } { global modulefilename global modulefilever - module whatis "${modulefilename}/${modulefilever}" + puts "module whatis ${modulefilename}/${modulefilever}" } ############################################################################## diff --git a/integration/linux/modulefiles/tbb32 b/integration/linux/modulefiles/tbb32 index db34135176..89d6bc60fe 100644 --- a/integration/linux/modulefiles/tbb32 +++ b/integration/linux/modulefiles/tbb32 @@ -1,6 +1,6 @@ #%Module1.0################################################################### # -# Copyright (c) 2020-2023 Intel Corporation +# Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -50,7 +50,7 @@ module-whatis "Dependencies: none" proc ModulesHelp { } { global modulefilename global modulefilever - module whatis "${modulefilename}/${modulefilever}" + puts "module whatis ${modulefilename}/${modulefilever}" } ############################################################################## diff --git a/integration/pkg-config/tbb.pc.in b/integration/pkg-config/tbb.pc.in index 34ea3bea17..2fe03c72e9 100644 --- a/integration/pkg-config/tbb.pc.in +++ b/integration/pkg-config/tbb.pc.in @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023 Intel Corporation +# Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ includedir=@_includedir_for_pc_file@ Name: oneAPI Threading Building Blocks (oneTBB) Description: C++ library for parallel programming on multi-core processors. 
-URL: https://github.com/oneapi-src/oneTBB +URL: https://github.com/uxlfoundation/oneTBB Version: @TBB_VERSION@ Libs: -L${libdir} @_tbb_pc_extra_libdir@ -l@_tbb_pc_lib_name@ Cflags: -I${includedir} diff --git a/rfcs/README.md b/rfcs/README.md index b1ae271b50..831c60d00d 100644 --- a/rfcs/README.md +++ b/rfcs/README.md @@ -4,28 +4,28 @@ The RFC process intends to: - Communicate library-wide changes - Collect feedback before implementation -- Increase transparency in decision-making +- Increase transparency in decision-making - Align different teams involved in oneTBB development -This directory contains design documents (RFCs) approved +This directory contains design documents (RFCs) approved or rejected for implementation in oneTBB. The possible RFC states are: -1. Initial +1. Initial 2. Proposed 3. Experimental 4. Supported 5. Archived -Most modifications or new features will naturally start as a part of a -GitHub issue or discussion. Small changes do not require a formal RFC. -However, if the issue or discussion results in an idea for a significant -change or new feature that affects the library's public API or architecture, -we recommend opening a PR to add a new RFC to the `rfcs/proposed` directory. +Most modifications or new features will naturally start as a part of a +GitHub issue or discussion. Small changes do not require a formal RFC. +However, if the issue or discussion results in an idea for a significant +change or new feature that affects the library's public API or architecture, +we recommend opening a PR to add a new RFC to the `rfcs/proposed` directory. The RFC should provide a detailed description and design of the proposed feature. 
-or new feature that significantly impacts the library's public API or -architecture, it will be suggested that a PR be opened to add a new rfc +or new feature that significantly impacts the library's public API or +architecture, it will be suggested that a PR be opened to add a new rfc to the `rfcs/proposed` directory. The RFC contains a more detailed description and design for the feature. @@ -35,47 +35,43 @@ A template for RFCs is available as [template.md](template.md). Place the modifi template in the subdirectory of the `rfcs/proposed` with a name of the form `_`. For example, a proposal for a new ``my_op`` flow graph node should be put into the -`rfcs/proposed/flow_graph_my_op_node` directory. Use [template.md](template.md) -to create the `README.md` file in that directory. The folder can +`rfcs/proposed/flow_graph_my_op_node` directory. Use [template.md](template.md) +to create the `README.md` file in that directory. The folder can contain other files referenced by the `README.md` file, such as figures. Once two maintainers approve the PR, it is merged into the `rfcs/proposed` -directory. Update the RFC document with additional information as the RFC moves -to different states. +directory. Update the RFC document with additional information as the RFC moves +to different states. -A proposal that is subsequently implemented and released in oneTBB +A proposal that is subsequently implemented and released in oneTBB as a preview feature is moved into the `rfcs/experimental` folder. The RFC for a preview feature in `rfcs/experimental` should include a description -of what is required to move from experimental to fully supported -- for +of what is required to move from experimental to fully supported -- for example, feedback from users, demonstrated performance improvements, etc. -A proposal that is implemented, added to the oneTBB specification, and -supported as a full feature appears in the `rfcs/supported` directory. 
An RFC -for a fully supported feature in the `rfcs/supported` directory should -have a link to the section in the oneTBB specification with its +A proposal that is implemented, added to the oneTBB specification, and +supported as a full feature appears in the `rfcs/supported` directory. An RFC +for a fully supported feature in the `rfcs/supported` directory should +have a link to the section in the oneTBB specification with its formal wording. -A feature that is removed or a proposal that is abandoned or rejected will +A feature that is removed or a proposal that is abandoned or rejected will be moved to the `rfcs/archived` folder. ## Document Style -The design documents are stored in the `rfcs` directory, and each RFC is placed -in its subdirectory under `rfcs/proposed/_`. +The design documents are stored in the `rfcs` directory, and each RFC is placed +in its subdirectory under `rfcs/proposed/_`. -- There must be a `README.md` file that contains the main RFC itself (or +- There must be a `README.md` file that contains the main RFC itself (or links to a file that contains it in the same directory). - - The RFC should follow the [template.md](template.md) structure. - - The directory can contain other supporting files, such as images, tex + - The RFC should follow the [template.md](template.md) structure. + - The directory can contain other supporting files, such as images, tex formulas, and sub-proposals / sub-RFCs. - - We highly recommend using a text-based file format like markdown for easy + - We highly recommend using a text-based file format like markdown for easy collaboration on GitHub, but other formats like PDFs may also be acceptable. - template file for writing RFCs. However, it is strongly recommended to use - text-based file format that can be rendered by GitHub to allow for easy - collaboration using PR comments. Even so, files such as pdfs may be - acceptable. 
- For the markdown-written RFC, keep the text width within - 80 characters, unless there is a reason to violate this rule, e.g., + 100 characters, unless there is a reason to violate this rule, e.g., long links or wide tables. -- It is also recommended to read through existing RFCs to better understand the +- It is also recommended to read through existing RFCs to better understand the general writing style and required elements. diff --git a/rfcs/experimental/README.md b/rfcs/experimental/README.md index 3793e84f74..9dcd233aee 100644 --- a/rfcs/experimental/README.md +++ b/rfcs/experimental/README.md @@ -5,24 +5,24 @@ released as preview features in the oneTBB library. A preview feature is expected to have an implementation that is of comparable quality to a fully supported feature. Sufficient tests are required. -An experimental feature does not yet appear as part of the oneTBB +An experimental feature does not yet appear as part of the oneTBB specification. Therefore, the interface and design can change. There is no commitment to backward compatibility for a preview feature. -The documents in this directory +The documents in this directory should include a list of the exit conditions that need to be met to move from preview to fully supported. These conditions might include demonstrated performance improvements, demonstrated interest from the community, -acceptance of the required oneTBB specification changes, etc. +acceptance of the required oneTBB specification changes, etc. For features that require oneTBB specification changes, the document might include wording for those changes or a link to any PRs that opened against the specification. -Proposals should not remain in the experimental directory forever. The +Proposals should not remain in the experimental directory forever. It should move either to the -supported folder when they become fully supported or the archived -folder if they are not fully accepted. 
It should be highly unusual for -a proposal to stay in the experimental folder for longer than a year or +supported folder when they become fully supported or the archived +folder if they are not fully accepted. It should be highly unusual for +a proposal to stay in the experimental folder for longer than a year or two. diff --git a/rfcs/proposed/loading-dependencies/loading-dependencies-by-module-name.org b/rfcs/proposed/loading-dependencies/loading-dependencies-by-module-name.org new file mode 100644 index 0000000000..c35db3309d --- /dev/null +++ b/rfcs/proposed/loading-dependencies/loading-dependencies-by-module-name.org @@ -0,0 +1,104 @@ +#+title: Loading Dependencies By Module Name + +* Introduction +There is a well-known attack that involves loading of a malicious dependency +instead of the original one without notice to the party that does this loading. +In the industry it is usually called /DLL injection/ or /DLL preloading attack/ +and it is mostly associated with the Windows platform as it is known to be +particularly vulnerable to this kind of attack [1]. One of the recommendations +that safeguards against this type of attack is to specify fully qualified path +to a dependency [2]. + +Historically, oneTBB loads its optional dependencies during its initialization +process when these dependencies are used for the first time. The way oneTBB does +this is by building full paths to their dependencies using the path where the +oneTBB library itself resides. It is the only sensible path which can be +obtained by oneTBB, whose usage conditions are not known at the time of +development. The purpose is to minimize the risk of a DLL injection attack issue +so that only certain paths are probed by the system loader. However, +dependencies of a dependency are still searched by their module names only [3]. 
+So, the risk is minimized only for a dependency itself and not for the libraries +it depends on, not to mention that the file of a dependency can be replaced in +the file system by an attacker, which breaks even that protection. Besides that, +loading of a dependency by specifying full path represents an inconvenience to +the developers who want to make use of their own variant of a dependency. Not +only do they need to place their variant of a dependency to all of the places from +which it is going to be found and loaded by every client component that depends +on it, but also this makes problematic the implementation (if not impossible) of +some scenarios where the dependency being loaded maintains single state shared +among all its clients. Such scenarios are hard to implement because copies of +the same DLL loaded from different paths are considered to be different DLLs and +in certain cases there is no support for filesystem linking mechanism to point +to a single file [4, 5]. + +So, what is the main problem due to which loading by a module name makes Windows +much more vulnerable to DLL injection than Linux? + +Besides difference in the order of accessing paths specified in the environment, +Windows also prioritizes searching in the directory from which the application +is loaded and current working directory [2]. Assuming that application is loaded +from a directory that requires administrative permission on write, which is +usually the case, it is the current working directory that forms the main DLL +preloading attack scenario [1]. + +There are approaches to exclude the current working directory from the search +order. However, for a library to avoid process-wide changes to the search order +the only viable solution for run-time loading is to pass +~LOAD_LIBRARY_SAFE_CURRENT_DIRS~ flag to the ~LoadLibraryEx~ Windows API [6]. 
+ +With the removal of the current working directory from loader's consideration, +the search order on Windows starts having little difference with the search +order on Linux. The difference includes the order in which directories specified +in the environment and system directories are considered, and the presence of +the first step of looking into an application directory on Windows [2, 7]. + +Since the system environment variables and the environment of other processes +cannot be changed, the only vulnerable place is an application directory [8, 9]. +Because the application can be installed in a directory that does not require +administrative permissions on write, it still can be started by an account +having them. Unlike Linux systems, for the process started with administrative +permissions, the paths specified in the environment and the application +directory are still considered by the Windows system loader [2, 7]. Therefore, +an attacker can update permissive installation directory with a malicious +version of a binary, hence making it loaded in a process with elevated +permissions. Note that specifying fully qualified path to the dependency does +not help in this case. + +Fortunately, there is a signature verification process that helps validating the +authenticity of a binary before loading it into process address space and +starting its execution. This allows making use of the established search order +while checking that genuine version of the dependency is used. However, not +loading the binary because of the failed signature verification might not be +always desired. Especially, during the development phase or for a software +distributor who does not have the signature with which to sign the binary. +Therefore, to preserve backward compatibility of such usage models, it is +essential to have the possibility to disable signature verification. 
+ +* Proposal +Based on the analysis in the "Introduction" section and to support versatile +distribution models of oneTBB this RFC proposes to: + +On Windows only: +1. Introduce signature verification step to the run-time dependency loading + process. +2. Introduce the ~TBB_VERIFY_DEPENDENCY_SIGNATURE~ compilation option that would + enable signature verification, and set it ~ON~ by default. +3. Update documentation to include information about new + ~TBB_VERIFY_DEPENDENCY_SIGNATURE~ flag. +4. Pass ~LOAD_LIBRARY_SAFE_CURRENT_DIRS~ flag to the ~LoadLibraryEx~ calls so + that current working directory is excluded from the list of directories in + which the system loader looks when trying to find and resolve dependency. + +On all OSes: +- Change dependency loading approach to load by module names only. + +* References +1. [[https://support.microsoft.com/en-us/topic/secure-loading-of-libraries-to-prevent-dll-preloading-attacks-d41303ec-0748-9211-f317-2edc819682e1][Microsoft, "Secure loading of libraries to prevent DLL preloading attacks".]] +2. [[https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-security][Microsoft, "Dynamic-Link Library Security", 7 January 2021]] +3. [[https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#factors-that-affect-searching][Microsoft, "Dynamic-link library search order", 9 February 2023]]. +4. [[https://learn.microsoft.com/en-us/windows/win32/dlls/run-time-dynamic-linking][Microsoft, "Run-Time Dynamic Linking", 7 January 2021]] +5. [[https://github.com/NuGet/Home/issues/10734][NuGet project issue on GitHub, "NuGet packaging should support symlinks within packages", 7 April 2021]] +6. [[https://learn.microsoft.com/en-us/windows/win32/api/LibLoaderAPI/nf-libloaderapi-loadlibraryexa][Microsoft, "LoadLibraryExA function (libloaderapi.h)", 9 February 2023]] +7. 
[[https://www.man7.org/linux/man-pages/man8/ld.so.8.html][Linux man-pages 6.9.1, "ld.so(8) — Linux manual page", 8 May 2024]] +8. [[https://learn.microsoft.com/en-us/windows/win32/procthread/environment-variables][Microsoft, "Environment Variables", 7 January 2021]] +9. [[https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-setenvironmentvariable][Microsoft, "SetEnvironmentVariable function (winbase.h)", 23 September 2022]] diff --git a/rfcs/proposed/parallel_block_for_task_arena/README.md b/rfcs/proposed/parallel_block_for_task_arena/README.md new file mode 100644 index 0000000000..207c318b4d --- /dev/null +++ b/rfcs/proposed/parallel_block_for_task_arena/README.md @@ -0,0 +1,274 @@ +# Adding API for parallel phase to task_arena to warm-up/retain/release worker threads + +## Introduction + +In oneTBB, there has never been an API that allows users to block worker threads within the arena. +This design choice was made to preserve the composability of the application. +Before PR#1352, workers moved to the thread pool to sleep once there were no arenas with active +demand. However, PR#1352 introduced a delayed leave behavior to the library that +results in blocking threads for an _implementation-defined_ duration inside an arena +if there is no active demand across all arenas. This change significantly +improved performance for various applications on high thread count systems. +The main idea is that usually, after one parallel computation ends, +another will start after some time. The delayed leave behavior is a heuristic to utilize this, +covering most cases within _implementation-defined_ duration. + +However, the new behavior is not the perfect match for all the scenarios: +* The heuristic of delayed leave is unsuitable for the tasks that are submitted + in an unpredictable pattern and/or durations. +* If oneTBB is used in composable scenarios it is not behaving as + a good citizen consuming CPU resources. 
+ * For example, if an application runs a series of stages where oneTBB is used for one stage + and OpenMP is used for a subsequent stage, there is a chance that oneTBB workers will + interfere with OpenMP threads. This interference might result in slight oversubscription, + which in turn might lead to underperformance. + +So there are two related problems but with different resolutions: +* Completely disable new behavior for scenarios where the heuristic of delayed leave is unsuitable. +* Optimize library behavior so customers can benefit from the heuristic of delayed leave but + make it possible to indicate that "it is the time for the TBB arena to release threads". + +## Proposal + +Let's tackle these problems one by one. + +### Completely disable new behavior + +Let’s consider both “Delayed leave” and “Fast leave” as 2 different states in state machine.
+* The "Delayed leave" heuristic benefits most of the workloads. Therefore, this is the + default behavior for arena. +* Workloads that has rather negative performance impact from the heuristic of delayed leave + can create an arena in “Fast leave” state. + + + +There will be a question that we need to answer: +* Do we see any value if arena potentially can transition from one to another state? + +To answer this question, the following scenarios should be considered: +* What if different types of workloads are mixed in one application? +* Different types of arenas can be used for different types of workloads. + +### When threads should leave? + +oneTBB itself can only guess when the ideal time to release threads from the arena is. +Therefore, it does its best effort to preserve and enhance performance without completely +messing up composability guarantees (that is how delayed leave is implemented). + +As we already discussed, there are cases where it does not work perfectly, +therefore customers that want to further optimize this +aspect of oneTBB behavior should be able to do it. + +This problem can be considered from another angle. Essentially, if the user can indicate +where parallel computation ends, they can also indicate where it starts. + + + +With this approach, the user not only releases threads when necessary but also specifies a +programmable block where worker threads should expect new work coming regularly +to the executing arena. + +Let’s add a new state to the existing state machine. To represent "Parallel Phase" state. + +> **_NOTE:_** The "Fast leave" state is colored Grey just for simplicity of the chart. + Let's assume that arena was created with the "Delayed leave". + The logic demonstrated below is applicable to the "Fast leave" as well. + + + +This state diagram leads to several questions: +* What if there are multiple Parallel Phases? +* If “End of Parallel Phase” leads back to “Delayed leave” how soon will threads + be released from arena? 
+ * What if we indicated that threads should leave arena after the "Parallel Phase"? + * What if we just indicated the end of the "Parallel Phase"? + +The extended state machine aims to answer these questions. +* The first call to the “Start of Phase” will transition into the “Parallel Phase” state. +* The last call to the “End of Phase” will transition back to the “Delayed leave” state + or into the "One-time Fast leave" if it is indicated that threads should leave sooner. +* Concurrent or nested calls to the “Start of Phase” or the “End of Phase” + increment/decrement a reference counter. + + + +Let's consider the semantics that an API for explicit parallel phases can provide: +* Start of a parallel phase: + * Indicates the point from which the scheduler can use a hint and keep threads in the arena + for longer. + * Serves as a warm-up hint to the scheduler: + * Allows reducing delays of computation start by initiating the wake-up of worker threads + in advance. +* "Parallel phase" itself: + * Scheduler can implement different policies to retain threads in the arena. + * For instance, more aggressive policy might be implemented for _parallel phase_. + It can be beneficial in cases when the default arena leave policy is not sufficient enough. + * The semantics for retaining threads is a hint to the scheduler; + thus, no real guarantee is provided. The scheduler can ignore the hint and + move threads to another arena or to sleep if conditions are met. +* End of a parallel phase: + * Indicates the point from which the scheduler may drop the hint and + no longer retain threads in the arena. + * Indicates that worker threads should avoid busy-waiting once there is no more work in the arena. + * Temporarily overrides the default arena leave policy, which will be restored when + new work is submitted. + + +### Proposed API + +Summary of API changes: + +* Add enumeration class for the arena leave policy. 
+* Add the policy as the last parameter to the arena constructor and initializer +defaulted to "automatic". +* Add functions to start and end the parallel phase to the `task_arena` class +and the `this_task_arena` namespace. +* Add RAII class to map a parallel phase to a code scope. + +```cpp +class task_arena { + enum class leave_policy : /* unspecified type */ { + automatic = /* unspecified */, + fast = /* unspecified */, + }; + + task_arena(int max_concurrency = automatic, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal, + leave_policy a_leave_policy = leave_policy::automatic); + + task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal, + leave_policy a_leave_policy = leave_policy::automatic); + + void initialize(int max_concurrency, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal, + leave_policy a_leave_policy = leave_policy::automatic); + + void initialize(constraints a_constraints, unsigned reserved_for_masters = 1, + priority a_priority = priority::normal, + leave_policy a_leave_policy = leave_policy::automatic); + + void start_parallel_phase(); + void end_parallel_phase(bool with_fast_leave = false); + + class scoped_parallel_phase { + scoped_parallel_phase(task_arena& ta, bool with_fast_leave = false); + }; +}; + +namespace this_task_arena { + void start_parallel_phase(); + void end_parallel_phase(bool with_fast_leave = false); +} +``` +The _parallel phase_ continues until each previous `start_parallel_phase` call +to the same arena has a matching `end_parallel_phase` call.
+Let's introduce an RAII scoped object that will help to manage the contract. + +If the end of the parallel phase is not indicated by the user, it will be done automatically when +the last public reference is removed from the arena (i.e., task_arena has been destroyed or, +for an implicitly created arena, the thread that owns it has completed). +This ensures correctness is preserved (threads will not be retained forever). + +### Examples + +The following code snippets show how the new API can be used. + +```cpp +void task_arena_leave_policy_example() { + tbb::task_arena ta{tbb::task_arena::automatic, 1, priority::normal, leave_policy::fast}; + ta.execute([]() { + // Parallel computation + }); + // Different parallel runtime is used + // so it is preferred that worker threads won't be retained + // in the arena at this point. + #pragma omp parallel for + for (int i = 0; i < work_size; ++i) { + // Computation + } +} + +void parallel_phase_example() { + tbb::this_task_arena::start_parallel_phase(); + tbb::parallel_for(0, work_size, [] (int idx) { + // User defined body + }); + + // Some serial computation + + tbb::parallel_for(0, work_size, [] (int idx) { + // User defined body + }); + tbb::this_task_arena::end_parallel_phase(/*with_fast_leave=*/true); + + // Different parallel runtime (for example, OpenMP) is used + // so it is preferred that worker threads won't be retained + // in the arena at this point. 
+ #pragma omp parallel for + for (int i = 0; i < work_size; ++i) { + // Computation + } +} + +void scoped_parallel_phase_example() { + tbb::task_arena ta{/*arena constraints*/}; + { + // Start of the parallel phase + tbb::task_arena::scoped_parallel_phase phase{ta, /*with_fast_leave=*/true}; + ta.execute([]() { + // Parallel computation + }); + + // Serial computation + + ta.execute([]() { + // Parallel computation + }); + } // End of the parallel phase + + // Different parallel runtime (for example, OpenMP) is used + // so it is preferred that worker threads won't be retained + // in the arena at this point. + #pragma omp parallel for + for (int i = 0; i < work_size; ++i) { + // Computation + } +} + +``` + +## Considerations + +The alternative approaches were also considered.
+We can express this state machine as complete graph and provide low-level interface that +will give control over state transition. + + + +We considered this approach too low-level. Plus, it leaves a question: "How to manage concurrent changes of the state?". + +The retaining of worker threads should be implemented with care because +it might introduce performance problems if: +* Threads cannot migrate to another arena because they are + retained in the current arena. +* Compute resources are not homogeneous, e.g., the CPU is hybrid. + Heavier involvement of less performant core types might result in artificial work + imbalance in the arena. + + +## Open Questions in Design + +Some open questions that remain: +* Are the suggested APIs sufficient? + * In the current version of proposed API, the `scoped_parallel_phase` object can be created + only for already existing `task_arena`. Should it be possible for `this_task_arena` as well? + * What should be expected from "Parallel Phase" API for `this_task_arena` when a calling thread + doesn't yet have any associated arena? + * Should parallel phase API be limited only to RAII-only style? + * Are there any scenarios where inconvenience of handling `scoped_parallel_phase` object is + not acceptable? +* Are there additional use cases that should be considered that we missed in our analysis? +* Do we see any value if arena potentially can transition from one to another state? + * What if different types of workloads are mixed in one application? + * What if there concurrent calls to this API? 
diff --git a/rfcs/proposed/parallel_block_for_task_arena/alternative_proposal.png b/rfcs/proposed/parallel_block_for_task_arena/alternative_proposal.png new file mode 100644 index 0000000000..654fa39fbf Binary files /dev/null and b/rfcs/proposed/parallel_block_for_task_arena/alternative_proposal.png differ diff --git a/rfcs/proposed/parallel_block_for_task_arena/completely_disable_new_behavior.png b/rfcs/proposed/parallel_block_for_task_arena/completely_disable_new_behavior.png new file mode 100644 index 0000000000..a0b6fc2bda Binary files /dev/null and b/rfcs/proposed/parallel_block_for_task_arena/completely_disable_new_behavior.png differ diff --git a/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_introduction.png b/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_introduction.png new file mode 100644 index 0000000000..30fb1cdfa3 Binary files /dev/null and b/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_introduction.png differ diff --git a/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_state_final.png b/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_state_final.png new file mode 100644 index 0000000000..7faadaf04e Binary files /dev/null and b/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_state_final.png differ diff --git a/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_state_initial.png b/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_state_initial.png new file mode 100644 index 0000000000..29952ba2ea Binary files /dev/null and b/rfcs/proposed/parallel_block_for_task_arena/parallel_phase_state_initial.png differ diff --git a/rfcs/template.md b/rfcs/template.md index cf407e78fb..9c589e60b9 100644 --- a/rfcs/template.md +++ b/rfcs/template.md @@ -2,7 +2,7 @@ ## Introduction -Short description of the idea proposed with explained motivation. +Short description of the idea proposed with explained motivation. 
The motivation could be: - Improved users experience for API changes and extensions. Code snippets to @@ -21,12 +21,12 @@ A full and detailed description of the proposal with highlighted consequences. Depending on the kind of the proposal, the description should cover: - New use cases supported by the extension. -- The expected performance benefit for a modification. -- The interface of extensions including class definitions or function +- The expected performance benefit for a modification. +- The interface of extensions including class definitions or function declarations. -A proposal should clearly outline the alternatives that were considered, -along with their pros and cons. Each alternative should be clearly separated +A proposal should clearly outline the alternatives that were considered, +along with their pros and cons. Each alternative should be clearly separated to make discussions easier to follow. Pay close attention to the following aspects of the library: @@ -50,19 +50,7 @@ Some other common subsections here are: without the original proposal, you can have it in the RFC. - Execution plan (next steps), if approved. -## Process Specific Information - -Depending on the state of the proposal, additional information should be -included. +## Open Questions For new proposals (i.e., those in the `rfcs/proposed` directory), list any open questions. - -For proposals released as preview features that are in the `rfcs/experimental` -directory, list the exit conditions to move from preview to fully supported. -These conditions might include demonstrated performance improvements, -acceptance of specification changes, etc. - -For proposals in the `rfcs/supported` directory, provide a link to the -any section(s) in the oneTBB specification that related to the proposal. -For modifications that do not affect the public API, no link is needed. 
diff --git a/src/tbb/CMakeLists.txt b/src/tbb/CMakeLists.txt index b996c736a7..8c84a0b29b 100644 --- a/src/tbb/CMakeLists.txt +++ b/src/tbb/CMakeLists.txt @@ -126,6 +126,25 @@ target_link_libraries(tbb ${TBB_COMMON_LINK_LIBS} ) +# Strip debug symbols into a separate .dbg file +if(TBB_LINUX_SEPARATE_DBG) + if(NOT CMAKE_BUILD_TYPE STREQUAL "release") + find_program(OBJCOPY_COMMAND objcopy) + if(NOT OBJCOPY_COMMAND) + message(WARNING "objcopy command not found in the system") + else() + add_custom_command(TARGET tbb POST_BUILD + COMMAND objcopy --only-keep-debug $ $.dbg + COMMAND objcopy --strip-debug $ + COMMAND objcopy --add-gnu-debuglink=$.dbg $ + COMMENT "Creating and associating .dbg file with tbb" + ) + endif() + else() + message(WARNING " TBB_LINUX_SEPARATE_DBG flag is not used on release config") + endif() +endif() + if(TBB_BUILD_APPLE_FRAMEWORKS) set_target_properties(tbb PROPERTIES FRAMEWORK TRUE @@ -158,7 +177,13 @@ if (TBB_INSTALL) COMPONENT devel ) endif() - + if(TBB_LINUX_SEPARATE_DBG) + install(FILES + $.dbg + DESTINATION lib + COMPONENT devel + ) + endif() set(_tbb_pc_lib_name tbb) if (WIN32) diff --git a/src/tbb/arena.cpp b/src/tbb/arena.cpp index 0e7cf43c3b..6ca062d02f 100644 --- a/src/tbb/arena.cpp +++ b/src/tbb/arena.cpp @@ -195,8 +195,6 @@ void arena::process(thread_data& tls) { return; } - my_tc_client.get_pm_client()->register_thread(); - __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); tls.attach_arena(*this, index); // worker thread enters the dispatch loop to look for a work @@ -236,8 +234,6 @@ void arena::process(thread_data& tls) { __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); __TBB_ASSERT(is_alive(my_guard), nullptr); - my_tc_client.get_pm_client()->unregister_thread(); - // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible // that arena may be temporarily left unpopulated by threads. See comments in // arena::on_thread_leaving() for more details. 
@@ -503,6 +499,7 @@ struct task_arena_impl { static void wait(d1::task_arena_base&); static int max_concurrency(const d1::task_arena_base*); static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*); + static d1::slot_id execution_slot(const d1::task_arena_base&); }; void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) { @@ -533,6 +530,10 @@ void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& ctx, d1::t task_arena_impl::enqueue(t, &ctx, ta); } +d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base& arena) { + return task_arena_impl::execution_slot(arena); +} + void task_arena_impl::initialize(d1::task_arena_base& ta) { // Enforce global market initialization to properly initialize soft limit (void)governor::get_thread_data(); @@ -559,7 +560,7 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) { ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); if (observer) { // TODO: Consider lazy initialization for internal arena so - // the direct calls to observer might be omitted until actual initialization. + // the direct calls to observer might be omitted until actual initialization. 
observer->on_scheduler_entry(true); } #endif /*__TBB_CPUBIND_PRESENT*/ @@ -624,6 +625,14 @@ void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_a a->enqueue_task(t, *ctx, *td); } +d1::slot_id task_arena_impl::execution_slot(const d1::task_arena_base& ta) { + thread_data* td = governor::get_thread_data_if_initialized(); + if (td && (td->is_attached_to(ta.my_arena.load(std::memory_order_relaxed)))) { + return td->my_arena_index; + } + return d1::slot_id(-1); +} + class nested_arena_context : no_copy { public: nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index) @@ -633,9 +642,11 @@ class nested_arena_context : no_copy { m_orig_arena = td.my_arena; m_orig_slot_index = td.my_arena_index; m_orig_last_observer = td.my_last_observer; + m_orig_is_thread_registered = td.my_is_registered; td.detach_task_dispatcher(); td.attach_arena(nested_arena, slot_index); + td.my_is_registered = false; if (td.my_inbox.is_idle_state(true)) td.my_inbox.set_is_idle(false); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); @@ -686,7 +697,7 @@ class nested_arena_context : no_copy { td.leave_task_dispatcher(); td.my_arena_slot->release(); td.my_arena->my_exit_monitors.notify_one(); // do not relax! 
- + td.my_is_registered = m_orig_is_thread_registered; td.attach_arena(*m_orig_arena, m_orig_slot_index); td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); @@ -702,6 +713,7 @@ class nested_arena_context : no_copy { unsigned m_orig_slot_index{}; bool m_orig_fifo_tasks_allowed{}; bool m_orig_critical_task_allowed{}; + bool m_orig_is_thread_registered{}; }; class delegated_task : public d1::task { diff --git a/src/tbb/concurrent_monitor.h b/src/tbb/concurrent_monitor.h index 3e5c4bebe8..cacfb145a8 100644 --- a/src/tbb/concurrent_monitor.h +++ b/src/tbb/concurrent_monitor.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -291,14 +291,14 @@ class concurrent_monitor_base { if (n != end) { my_waitset.remove(*n); -// GCC 12.x-13.x issues a warning here that to_wait_node(n)->my_is_in_list might have size 0, since n is +// GCC 12.x-14.x issues a warning here that to_wait_node(n)->my_is_in_list might have size 0, since n is // a base_node pointer. (This cannot happen, because only wait_node pointers are added to my_waitset.) 
-#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER +#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstringop-overflow" #endif to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); -#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 140000 ) && !__clang__ && !__INTEL_COMPILER +#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER #pragma GCC diagnostic pop #endif } diff --git a/src/tbb/def/lin32-tbb.def b/src/tbb/def/lin32-tbb.def index c9582a73d2..737e8ec2af 100644 --- a/src/tbb/def/lin32-tbb.def +++ b/src/tbb/def/lin32-tbb.def @@ -106,6 +106,7 @@ _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEi; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; +_ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; @@ -160,4 +161,3 @@ local: /* TODO: fill more precisely */ *; }; - diff --git a/src/tbb/def/lin64-tbb.def b/src/tbb/def/lin64-tbb.def index 003350b1b7..41aca2e932 100644 --- a/src/tbb/def/lin64-tbb.def +++ b/src/tbb/def/lin64-tbb.def @@ -106,6 +106,7 @@ _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; +_ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; diff --git a/src/tbb/def/mac64-tbb.def b/src/tbb/def/mac64-tbb.def index 
f8d7ed6bb6..38bc48d30e 100644 --- a/src/tbb/def/mac64-tbb.def +++ b/src/tbb/def/mac64-tbb.def @@ -108,6 +108,7 @@ __ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl __ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE __ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE __ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE +__ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE # System topology parsing and threads pinning (governor.cpp) __ZN3tbb6detail2r115numa_node_countEv @@ -157,4 +158,3 @@ __ZN3tbb6detail2r121notify_by_address_allEPv # Versioning (version.cpp) _TBB_runtime_interface_version _TBB_runtime_version - diff --git a/src/tbb/def/win32-tbb.def b/src/tbb/def/win32-tbb.def index c7c09e62f4..94b5441701 100644 --- a/src/tbb/def/win32-tbb.def +++ b/src/tbb/def/win32-tbb.def @@ -100,6 +100,7 @@ EXPORTS ?terminate@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?wait@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?enqueue@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@PAVtask_arena_base@523@@Z +?execution_slot@r1@detail@tbb@@YAGABVtask_arena_base@d1@23@@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ diff --git a/src/tbb/def/win64-tbb.def b/src/tbb/def/win64-tbb.def index 0fb46c2933..96bafc0163 100644 --- a/src/tbb/def/win64-tbb.def +++ b/src/tbb/def/win64-tbb.def @@ -100,6 +100,7 @@ EXPORTS ?isolate_within_arena@r1@detail@tbb@@YAXAEAVdelegate_base@d1@23@_J@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@PEAVtask_arena_base@523@@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@PEAVtask_arena_base@523@@Z +?execution_slot@r1@detail@tbb@@YAGAEBVtask_arena_base@d1@23@@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ diff --git a/src/tbb/governor.cpp b/src/tbb/governor.cpp index 218a2bc533..4b417b4043 100644 --- a/src/tbb/governor.cpp +++ b/src/tbb/governor.cpp @@ 
-151,17 +151,16 @@ bool governor::does_client_join_workers(const rml::tbb_client &client) { 3) If the user app strives to conserve the memory by cutting stack size, it should do this for TBB workers too (as in the #1). */ -static std::uintptr_t get_stack_base(std::size_t stack_size) { +static void get_stack_attributes(std::uintptr_t& stack_base, std::size_t& stack_size, std::size_t fallback_stack_size) { // Stacks are growing top-down. Highest address is called "stack base", // and the lowest is "stack limit". + stack_size = fallback_stack_size; #if __TBB_USE_WINAPI - suppress_unused_warning(stack_size); NT_TIB* pteb = (NT_TIB*)NtCurrentTeb(); __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB"); - return reinterpret_cast(pteb->StackBase); + stack_base = reinterpret_cast(pteb->StackBase); #elif defined(EMSCRIPTEN) - suppress_unused_warning(stack_size); - return reinterpret_cast(emscripten_stack_get_base()); + stack_base = reinterpret_cast(emscripten_stack_get_base()); #else // There is no portable way to get stack base address in Posix, so we use // non-portable method (on all modern Linux) or the simplified approach @@ -176,11 +175,12 @@ static std::uintptr_t get_stack_base(std::size_t stack_size) { if (0 == pthread_getattr_np(pthread_self(), &np_attr_stack)) { if (0 == pthread_attr_getstack(&np_attr_stack, &stack_limit, &np_stack_size)) { __TBB_ASSERT( &stack_limit > stack_limit, "stack size must be positive" ); + if (np_stack_size > 0) + stack_size = np_stack_size; } pthread_attr_destroy(&np_attr_stack); } #endif /* __linux__ */ - std::uintptr_t stack_base{}; if (stack_limit) { stack_base = reinterpret_cast(stack_limit) + stack_size; } else { @@ -188,7 +188,6 @@ static std::uintptr_t get_stack_base(std::size_t stack_size) { int anchor{}; stack_base = reinterpret_cast(&anchor); } - return stack_base; #endif /* __TBB_USE_WINAPI */ } @@ -219,8 +218,8 @@ void governor::init_external_thread() { td.attach_arena(a, /*slot 
index*/ 0); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); - stack_size = a.my_threading_control->worker_stack_size(); - std::uintptr_t stack_base = get_stack_base(stack_size); + std::uintptr_t stack_base{}; + get_stack_attributes(stack_base, stack_size, a.my_threading_control->worker_stack_size()); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); td.enter_task_dispatcher(task_disp, calculate_stealing_threshold(stack_base, stack_size)); diff --git a/src/tbb/misc_ex.cpp b/src/tbb/misc_ex.cpp index 13b7b04fb1..03b33b464f 100644 --- a/src/tbb/misc_ex.cpp +++ b/src/tbb/misc_ex.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -297,11 +297,21 @@ static void initialize_hardware_concurrency_info () { if ( pam & m ) ++nproc; } - __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr); + int number_of_processors = (int)si.dwNumberOfProcessors; + if (nproc > number_of_processors && TBB_GetThreadGroupAffinity) { + // Sometimes on systems with multiple processor groups GetNativeSystemInfo + // reports mask and processor count from the parent process + TBB_GROUP_AFFINITY ga; + if (TBB_GetThreadGroupAffinity(GetCurrentThread(), &ga)) { + number_of_processors = (int)TBB_GetActiveProcessorCount(ga.Group); + } + } + + __TBB_ASSERT( nproc <= number_of_processors, nullptr); // By default setting up a number of processors for one processor group theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present - if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) { + if ( nproc == number_of_processors && TBB_GetActiveProcessorCount ) { // The process does not have restricting affinity 
mask and multiple processor groups are possible ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount(); __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr); diff --git a/src/tbb/queuing_rw_mutex.cpp b/src/tbb/queuing_rw_mutex.cpp index 8818c51a20..5826592b74 100644 --- a/src/tbb/queuing_rw_mutex.cpp +++ b/src/tbb/queuing_rw_mutex.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -608,7 +608,7 @@ bool __TBB_EXPORTED_FUNC downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& return queuing_rw_mutex_impl::downgrade_to_reader(s); } -void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) { +TBB_EXPORT void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) { queuing_rw_mutex_impl::construct(m); } diff --git a/src/tbb/scheduler_common.h b/src/tbb/scheduler_common.h index 06ec543e6b..e4686e1673 100644 --- a/src/tbb/scheduler_common.h +++ b/src/tbb/scheduler_common.h @@ -397,7 +397,7 @@ struct suspend_point_type { void finilize_resume() { m_stack_state.store(stack_state::active, std::memory_order_relaxed); - // Set the suspended state for the stack that we left. If the state is already notified, it means that + // Set the suspended state for the stack that we left. If the state is already notified, it means that // someone already tried to resume our previous stack but failed. So, we need to resume it. 
// m_prev_suspend_point might be nullptr when destroying co_context based on threads if (m_prev_suspend_point && m_prev_suspend_point->m_stack_state.exchange(stack_state::suspended) == stack_state::notified) { diff --git a/src/tbb/semaphore.h b/src/tbb/semaphore.h index 9d27f3ac98..d4eadc2cf1 100644 --- a/src/tbb/semaphore.h +++ b/src/tbb/semaphore.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -98,7 +98,11 @@ namespace r1 { #if __TBB_USE_FUTEX static inline int futex_wait( void *futex, int comparand ) { +#ifdef __OpenBSD__ + int r = ::futex((volatile uint32_t *)futex, __TBB_FUTEX_WAIT, comparand, nullptr, nullptr); +#else int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAIT, comparand, nullptr, nullptr, 0); +#endif #if TBB_USE_ASSERT int e = errno; __TBB_ASSERT(r == 0 || r == EWOULDBLOCK || (r == -1 && (e == EAGAIN || e == EINTR)), "futex_wait failed."); @@ -107,7 +111,11 @@ static inline int futex_wait( void *futex, int comparand ) { } static inline int futex_wakeup_one( void *futex ) { +#ifdef __OpenBSD__ + int r = ::futex((volatile uint32_t *)futex, __TBB_FUTEX_WAKE, 1 , nullptr, nullptr); +#else int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAKE, 1, nullptr, nullptr, 0); +#endif __TBB_ASSERT(r == 0 || r == 1, "futex_wakeup_one: more than one thread woken up?"); return r; } diff --git a/src/tbb/task.cpp b/src/tbb/task.cpp index fde41980a0..84b4278f0a 100644 --- a/src/tbb/task.cpp +++ b/src/tbb/task.cpp @@ -255,4 +255,3 @@ d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex } // namespace r1 } // namespace detail } // namespace tbb - diff --git a/src/tbb/task_dispatcher.h b/src/tbb/task_dispatcher.h index 20c7c731a7..c818934e5a 100644 --- a/src/tbb/task_dispatcher.h +++ b/src/tbb/task_dispatcher.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 
Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -249,15 +249,21 @@ d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { task_dispatcher& task_disp; execution_data_ext old_execute_data_ext; properties old_properties; + bool is_initially_registered; ~dispatch_loop_guard() { task_disp.m_execute_data_ext = old_execute_data_ext; task_disp.m_properties = old_properties; + if (!is_initially_registered) { + task_disp.m_thread_data->my_arena->my_tc_client.get_pm_client()->unregister_thread(); + task_disp.m_thread_data->my_is_registered = false; + } + __TBB_ASSERT(task_disp.m_thread_data && governor::is_thread_data_set(task_disp.m_thread_data), nullptr); __TBB_ASSERT(task_disp.m_thread_data->my_task_dispatcher == &task_disp, nullptr); } - } dl_guard{ *this, m_execute_data_ext, m_properties }; + } dl_guard{ *this, m_execute_data_ext, m_properties, m_thread_data->my_is_registered }; // The context guard to track fp setting and itt tasks. context_guard_helper context_guard; @@ -282,6 +288,11 @@ d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) { m_properties.outermost = false; m_properties.fifo_tasks_allowed = false; + if (!dl_guard.is_initially_registered) { + m_thread_data->my_arena->my_tc_client.get_pm_client()->register_thread(); + m_thread_data->my_is_registered = true; + } + t = get_critical_task(t, ed, isolation, critical_allowed); if (t && m_thread_data->my_inbox.is_idle_state(true)) { // The thread has a work to do. 
Therefore, marking its inbox as not idle so that diff --git a/src/tbb/tcm.h b/src/tbb/tcm.h index 05fe0434eb..66ee18a2f0 100644 --- a/src/tbb/tcm.h +++ b/src/tbb/tcm.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -50,7 +50,8 @@ typedef struct _tcm_permit_flags_t { uint32_t stale : 1; uint32_t rigid_concurrency : 1; uint32_t exclusive : 1; - uint32_t reserved : 29; + uint32_t request_as_inactive : 1; + uint32_t reserved : 28; } tcm_permit_flags_t; typedef struct _tcm_callback_flags_t { diff --git a/src/tbb/tcm_adaptor.cpp b/src/tbb/tcm_adaptor.cpp index e20ebb831d..85ca125b4e 100644 --- a/src/tbb/tcm_adaptor.cpp +++ b/src/tbb/tcm_adaptor.cpp @@ -170,7 +170,7 @@ class tcm_client : public pm_client { __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); } - void init(d1::constraints& constraints) { + void init(tcm_client_id_t client_id, d1::constraints& constraints) { __TBB_ASSERT(tcm_request_permit, nullptr); __TBB_ASSERT(tcm_deactivate_permit, nullptr); @@ -190,6 +190,12 @@ class tcm_client : public pm_client { my_permit_request.min_sw_threads = 0; my_permit_request.max_sw_threads = 0; + my_permit_request.flags.request_as_inactive = 1; + + tcm_result_t res = tcm_request_permit(client_id, my_permit_request, this, &my_permit_handle, nullptr); + __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); + + my_permit_request.flags.request_as_inactive = 0; } void register_thread() override { @@ -279,7 +285,7 @@ pm_client* tcm_adaptor::create_client(arena& a) { } void tcm_adaptor::register_client(pm_client* c, d1::constraints& constraints) { - static_cast(c)->init(constraints); + static_cast(c)->init(my_impl->client_id, constraints); } void tcm_adaptor::unregister_and_destroy_client(pm_client& c) { diff --git a/src/tbb/thread_data.h b/src/tbb/thread_data.h index 9dfa492a72..422ec694ec 100644 
--- a/src/tbb/thread_data.h +++ b/src/tbb/thread_data.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2020-2023 Intel Corporation + Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -101,6 +101,7 @@ class thread_data : public ::rml::job thread_data(unsigned short index, bool is_worker) : my_arena_index{ index } , my_is_worker{ is_worker } + , my_is_registered { false } , my_task_dispatcher{ nullptr } , my_arena{ nullptr } , my_last_client{ nullptr } @@ -145,6 +146,8 @@ class thread_data : public ::rml::job //! Indicates if the thread is created by RML const bool my_is_worker; + bool my_is_registered; + //! The current task dipsatcher task_dispatcher* my_task_dispatcher; diff --git a/src/tbbbind/tbb_bind.cpp b/src/tbbbind/tbb_bind.cpp index bb52e11517..143d143344 100644 --- a/src/tbbbind/tbb_bind.cpp +++ b/src/tbbbind/tbb_bind.cpp @@ -37,6 +37,10 @@ #define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400) #define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500) +#define __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE (_WIN32 && HWLOC_API_VERSION >= 0x20500) +#if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE + #include +#endif // Most of hwloc calls returns negative exit code on error. // This macro tracks error codes that are returned from the hwloc interfaces. 
@@ -58,6 +62,7 @@ class system_topology { hwloc_cpuset_t process_cpu_affinity_mask{nullptr}; hwloc_nodeset_t process_node_affinity_mask{nullptr}; std::size_t number_of_processors_groups{1}; + std::vector processor_groups_affinity_masks_list{}; // NUMA API related topology members std::vector numa_affinity_masks_list{}; @@ -76,7 +81,7 @@ class system_topology { // Binding threads that locate in another Windows Processor groups // is allowed only if machine topology contains several Windows Processors groups - // and process affinity mask wasn't limited manually (affinity mask cannot violates + // and process affinity mask wasn't limited manually (affinity mask cannot violate // processors group boundaries). bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; } @@ -232,6 +237,27 @@ class system_topology { } } +#if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE + void processor_groups_topology_parsing() { + __TBB_ASSERT(number_of_processors_groups > 1, nullptr); + processor_groups_affinity_masks_list.resize(number_of_processors_groups); + for (unsigned group = 0; group < number_of_processors_groups; ++group) { + processor_groups_affinity_masks_list[group] = hwloc_bitmap_alloc(); + assertion_hwloc_wrapper(hwloc_windows_get_processor_group_cpuset, topology, group, + processor_groups_affinity_masks_list[group], /*flags*/0); + } + +#if TBB_USE_ASSERT + affinity_mask tmp = hwloc_bitmap_alloc(); + for (auto proc_group_mask : processor_groups_affinity_masks_list) { + __TBB_ASSERT(!hwloc_bitmap_intersects(tmp, proc_group_mask), "Masks of processor groups intersect."); + hwloc_bitmap_or(tmp, tmp, proc_group_mask); + } + hwloc_bitmap_free(tmp); +#endif + } +#endif + void enforce_hwloc_2_5_runtime_linkage() { // Without the call of this function HWLOC 2.4 can be successfully loaded during the tbbbind_2_5 loading. 
// It is possible since tbbbind_2_5 don't use any new entry points that were introduced in HWLOC 2.5 @@ -244,7 +270,7 @@ class system_topology { #endif } - + void initialize( std::size_t groups_num ) { if ( initialization_state != uninitialized ) return; @@ -252,6 +278,11 @@ class system_topology { topology_initialization(groups_num); numa_topology_parsing(); core_types_topology_parsing(); +#if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE + if (intergroup_binding_allowed(groups_num)) { + processor_groups_topology_parsing(); + } +#endif enforce_hwloc_2_5_runtime_linkage(); @@ -293,6 +324,10 @@ class system_topology { hwloc_bitmap_free(core_type_mask); } + for (auto& processor_group : processor_groups_affinity_masks_list) { + hwloc_bitmap_free(processor_group); + } + hwloc_bitmap_free(process_node_affinity_mask); hwloc_bitmap_free(process_cpu_affinity_mask); } @@ -369,6 +404,32 @@ class system_topology { hwloc_bitmap_and(result_mask, result_mask, constraints_mask); } + /** + * Finds processor group for the passed slot number, which are from 0 to max concurrency - 1, by + * traversing masks of processor groups one by one, intersecting them with the constrained mask. + * Once total weight of processor groups united mask is greater than the slot number, the mask + * of the last traversed processor group is returned, denoting the mask to apply to the thread + * occupying given slot number. + */ + void fit_to_processor_group(affinity_mask result_mask, affinity_mask constraints_mask, std::size_t slot_num) { + __TBB_ASSERT(number_of_processors_groups > 1, nullptr); + hwloc_bitmap_zero(result_mask); + int constraints_mask_weight = hwloc_bitmap_weight(constraints_mask); + // Map slot number to a number within constraints mask if + // max concurrency is greater than weight of the mask. 
+ slot_num %= constraints_mask_weight; + std::size_t total_weight = 0; + for (auto& processor_group : processor_groups_affinity_masks_list) { + if (hwloc_bitmap_intersects(constraints_mask, processor_group)) { + hwloc_bitmap_and(result_mask, processor_group, constraints_mask); + total_weight += hwloc_bitmap_weight(result_mask); + if (slot_num < total_weight) { + return; // Corresponding processor group where to bind the thread is found + } + } + } + } + int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) { __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); @@ -407,7 +468,7 @@ class system_topology { system_topology* system_topology::instance_ptr{nullptr}; class binding_handler { - // Following vector saves thread affinity mask on scheduler entry to return it to this thread + // Following vector saves thread affinity mask on scheduler entry to return it to this thread // on scheduler exit. typedef std::vector affinity_masks_container; affinity_masks_container affinity_backup; @@ -459,26 +520,28 @@ class binding_handler { "Trying to get access to uninitialized system_topology"); topology.store_current_affinity_mask(affinity_backup[slot_num]); - + system_topology::affinity_mask thread_affinity = handler_affinity_mask; #ifdef _WIN32 - // TBBBind supports only systems where NUMA nodes and core types do not cross the border - // between several processor groups. So if a certain NUMA node or core type constraint - // specified, then the constraints affinity mask will not cross the processor groups' border. - - // But if we have constraint based only on the max_threads_per_core setting, then the - // constraints affinity mask does may cross the border between several processor groups - // on machines with more then 64 hardware threads. 
That is why we need to use the special + // If we have a constraint based only on the max_threads_per_core setting, then the + // constraints affinity mask may cross the border between several processor groups + // on systems with more then 64 logical processors. That is why we need to use the special // function, which regulates the number of threads in the current threads mask. + bool is_default_numa = my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1; + bool is_default_core_type = my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1; if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 && - (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) && - (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1) + is_default_numa && is_default_core_type ) { topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask); - topology.set_affinity_mask(affinity_buffer[slot_num]); - return; + thread_affinity = affinity_buffer[slot_num]; + } + #if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE + else if (topology.number_of_processors_groups > 1) { + topology.fit_to_processor_group(affinity_buffer[slot_num], handler_affinity_mask, slot_num); + thread_affinity = affinity_buffer[slot_num]; } + #endif #endif - topology.set_affinity_mask(handler_affinity_mask); + topology.set_affinity_mask(thread_affinity); } void restore_previous_affinity_mask( unsigned slot_num ) { @@ -528,7 +591,7 @@ TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_ return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core); } -void __TBB_internal_destroy_system_topology() { +TBBBIND_EXPORT void __TBB_internal_destroy_system_topology() { return system_topology::destroy(); } diff --git a/src/tbbmalloc/frontend.cpp b/src/tbbmalloc/frontend.cpp index 77f9d6594e..f05aff23ad 100644 --- a/src/tbbmalloc/frontend.cpp +++ 
b/src/tbbmalloc/frontend.cpp @@ -817,6 +817,7 @@ unsigned int getSmallObjectIndex(unsigned int size) /* * Depending on indexRequest, for a given size return either the index into the bin * for objects of this size, or the actual size of objects in this bin. + * TODO: Change return type to unsigned short. */ template static unsigned int getIndexOrObjectSize (unsigned int size) @@ -1581,6 +1582,7 @@ void Block::initEmptyBlock(TLSData *tls, size_t size) unsigned int objSz = getObjectSize(size); cleanBlockHeader(); + MALLOC_ASSERT(objSz <= USHRT_MAX, "objSz must not be less 2^16-1"); objectSize = objSz; markOwned(tls); // bump pointer should be prepared for first allocation - thus mode it down to objectSize @@ -2949,7 +2951,7 @@ extern "C" void scalable_free(void *object) } #if MALLOC_ZONE_OVERLOAD_ENABLED -extern "C" void __TBB_malloc_free_definite_size(void *object, size_t size) +extern "C" TBBMALLOC_EXPORT void __TBB_malloc_free_definite_size(void *object, size_t size) { internalPoolFree(defaultMemPool, object, size); } diff --git a/src/tbbmalloc/large_objects.h b/src/tbbmalloc/large_objects.h index 8519784267..58d7c81a7b 100644 --- a/src/tbbmalloc/large_objects.h +++ b/src/tbbmalloc/large_objects.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -81,18 +81,25 @@ struct HugeBinStructureProps { static size_t alignToBin(size_t size) { MALLOC_ASSERT(size >= StepFactor, "Size must not be less than the StepFactor"); - size_t minorStepExp = BitScanRev(size) - StepFactorExp; + + int sizeExp = (int)BitScanRev(size); + MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= stepfactor > 0"); + MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= stepFactor"); + int minorStepExp = sizeExp - StepFactorExp; + return alignUp(size, 1ULL << minorStepExp); } // Sizes between the power of 2 values are approximated to StepFactor. static int sizeToIdx(size_t size) { MALLOC_ASSERT(MinSize <= size && size <= MaxSize, ASSERT_TEXT); + int sizeExp = (int)BitScanRev(size); // same as __TBB_Log2 - MALLOC_ASSERT(sizeExp >= 0, "A shift amount (sizeExp) must not be negative"); - size_t majorStepSize = 1ULL << sizeExp; + MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= stepfactor > 0"); + MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= stepFactor"); int minorStepExp = sizeExp - StepFactorExp; - MALLOC_ASSERT(minorStepExp >= 0, "A shift amount (minorStepExp) must not be negative"); + + size_t majorStepSize = 1ULL << sizeExp; int minorIdx = (size - majorStepSize) >> minorStepExp; MALLOC_ASSERT(size == majorStepSize + ((size_t)minorIdx << minorStepExp), "Size is not aligned on the bin"); diff --git a/src/tbbmalloc/tbbmalloc_internal.h b/src/tbbmalloc/tbbmalloc_internal.h index 44fa47aaab..bc0ee2ffb5 100644 --- a/src/tbbmalloc/tbbmalloc_internal.h +++ b/src/tbbmalloc/tbbmalloc_internal.h @@ -232,9 +232,13 @@ template class BitMaskMax : public BitMaskBasic { public: void set(size_t idx, bool val) { + MALLOC_ASSERT(NUM >= idx + 1, ASSERT_TEXT); + BitMaskBasic::set(NUM - 1 - idx, val); } int getMaxTrue(unsigned startIdx) const { + MALLOC_ASSERT(NUM >= startIdx + 1, ASSERT_TEXT); + int p = BitMaskBasic::getMinTrue(NUM-startIdx-1); return 
-1==p? -1 : (int)NUM - 1 - p; } @@ -496,7 +500,11 @@ class HugePagesStatus { MALLOC_ASSERT(!pageSize, "Huge page size can't be set twice. Double initialization."); // Initialize object variables - pageSize = hugePageSize * 1024; // was read in KB from meminfo + if (hugePageSize > -1) { + pageSize = hugePageSize * 1024; // was read in KB from meminfo + } else { + pageSize = 0; + } isHPAvailable = hpAvailable; isTHPAvailable = thpAvailable; } diff --git a/src/tbbmalloc_proxy/proxy.cpp b/src/tbbmalloc_proxy/proxy.cpp index a6d3dea06f..d27591bb49 100644 --- a/src/tbbmalloc_proxy/proxy.cpp +++ b/src/tbbmalloc_proxy/proxy.cpp @@ -133,7 +133,7 @@ static inline void initPageSize() 2) check that dlsym("malloc") found something different from our replacement malloc */ -extern "C" void *__TBB_malloc_proxy(size_t) __TBB_ALIAS_ATTR_COPY(malloc); +extern "C" TBBMALLOCPROXY_EXPORT void *__TBB_malloc_proxy(size_t) __TBB_ALIAS_ATTR_COPY(malloc); static void *orig_msize; @@ -184,23 +184,23 @@ inline void InitOrigPointers() {} #endif // MALLOC_UNIXLIKE_OVERLOAD_ENABLED and MALLOC_ZONE_OVERLOAD_ENABLED -void *PREFIX(malloc)(ZONE_ARG size_t size) __THROW +TBBMALLOCPROXY_EXPORT void *PREFIX(malloc)(ZONE_ARG size_t size) __THROW { return scalable_malloc(size); } -void *PREFIX(calloc)(ZONE_ARG size_t num, size_t size) __THROW +TBBMALLOCPROXY_EXPORT void *PREFIX(calloc)(ZONE_ARG size_t num, size_t size) __THROW { return scalable_calloc(num, size); } -void PREFIX(free)(ZONE_ARG void *object) __THROW +TBBMALLOCPROXY_EXPORT void PREFIX(free)(ZONE_ARG void *object) __THROW { InitOrigPointers(); __TBB_malloc_safer_free(object, (void (*)(void*))orig_free); } -void *PREFIX(realloc)(ZONE_ARG void* ptr, size_t sz) __THROW +TBBMALLOCPROXY_EXPORT void *PREFIX(realloc)(ZONE_ARG void* ptr, size_t sz) __THROW { InitOrigPointers(); return __TBB_malloc_safer_realloc(ptr, sz, orig_realloc); @@ -209,13 +209,13 @@ void *PREFIX(realloc)(ZONE_ARG void* ptr, size_t sz) __THROW /* The older *NIX interface for 
aligned allocations; it's formally substituted by posix_memalign and deprecated, so we do not expect it to cause cyclic dependency with C RTL. */ -void *PREFIX(memalign)(ZONE_ARG size_t alignment, size_t size) __THROW +TBBMALLOCPROXY_EXPORT void *PREFIX(memalign)(ZONE_ARG size_t alignment, size_t size) __THROW { return scalable_aligned_malloc(size, alignment); } /* valloc allocates memory aligned on a page boundary */ -void *PREFIX(valloc)(ZONE_ARG size_t size) __THROW +TBBMALLOCPROXY_EXPORT void *PREFIX(valloc)(ZONE_ARG size_t size) __THROW { if (! memoryPageSize) initPageSize(); @@ -229,23 +229,23 @@ void *PREFIX(valloc)(ZONE_ARG size_t size) __THROW // match prototype from system headers #if __ANDROID__ -size_t malloc_usable_size(const void *ptr) __THROW +TBBMALLOCPROXY_EXPORT size_t malloc_usable_size(const void *ptr) __THROW #else -size_t malloc_usable_size(void *ptr) __THROW +TBBMALLOCPROXY_EXPORT size_t malloc_usable_size(void *ptr) __THROW #endif { InitOrigPointers(); return __TBB_malloc_safer_msize(const_cast(ptr), (size_t (*)(void*))orig_msize); } -int posix_memalign(void **memptr, size_t alignment, size_t size) __THROW +TBBMALLOCPROXY_EXPORT int posix_memalign(void **memptr, size_t alignment, size_t size) __THROW { return scalable_posix_memalign(memptr, alignment, size); } /* pvalloc allocates smallest set of complete pages which can hold the requested number of bytes. Result is aligned on page boundary. */ -void *pvalloc(size_t size) __THROW +TBBMALLOCPROXY_EXPORT void *pvalloc(size_t size) __THROW { if (! 
memoryPageSize) initPageSize(); // align size up to the page size, @@ -255,13 +255,13 @@ void *pvalloc(size_t size) __THROW return scalable_aligned_malloc(size, memoryPageSize); } -int mallopt(int /*param*/, int /*value*/) __THROW +TBBMALLOCPROXY_EXPORT int mallopt(int /*param*/, int /*value*/) __THROW { return 1; } #if defined(__GLIBC__) || defined(__ANDROID__) -struct mallinfo mallinfo() __THROW +TBBMALLOCPROXY_EXPORT struct mallinfo mallinfo() __THROW { struct mallinfo m; memset(&m, 0, sizeof(struct mallinfo)); @@ -274,30 +274,30 @@ struct mallinfo mallinfo() __THROW // Android doesn't have malloc_usable_size, provide it to be compatible // with Linux, in addition overload dlmalloc_usable_size() that presented // under Android. -size_t dlmalloc_usable_size(const void *ptr) __TBB_ALIAS_ATTR_COPY(malloc_usable_size); +TBBMALLOCPROXY_EXPORT size_t dlmalloc_usable_size(const void *ptr) __TBB_ALIAS_ATTR_COPY(malloc_usable_size); #else // __ANDROID__ // TODO: consider using __typeof__ to guarantee the correct declaration types // C11 function, supported starting GLIBC 2.16 -void *aligned_alloc(size_t alignment, size_t size) __TBB_ALIAS_ATTR_COPY(memalign); +TBBMALLOCPROXY_EXPORT void *aligned_alloc(size_t alignment, size_t size) __TBB_ALIAS_ATTR_COPY(memalign); // Those non-standard functions are exported by GLIBC, and might be used // in conjunction with standard malloc/free, so we must overload them. // Bionic doesn't have them. Not removing from the linker scripts, // as absent entry points are ignored by the linker. 
-void *__libc_malloc(size_t size) __TBB_ALIAS_ATTR_COPY(malloc); -void *__libc_calloc(size_t num, size_t size) __TBB_ALIAS_ATTR_COPY(calloc); -void *__libc_memalign(size_t alignment, size_t size) __TBB_ALIAS_ATTR_COPY(memalign); -void *__libc_pvalloc(size_t size) __TBB_ALIAS_ATTR_COPY(pvalloc); -void *__libc_valloc(size_t size) __TBB_ALIAS_ATTR_COPY(valloc); +TBBMALLOCPROXY_EXPORT void *__libc_malloc(size_t size) __TBB_ALIAS_ATTR_COPY(malloc); +TBBMALLOCPROXY_EXPORT void *__libc_calloc(size_t num, size_t size) __TBB_ALIAS_ATTR_COPY(calloc); +TBBMALLOCPROXY_EXPORT void *__libc_memalign(size_t alignment, size_t size) __TBB_ALIAS_ATTR_COPY(memalign); +TBBMALLOCPROXY_EXPORT void *__libc_pvalloc(size_t size) __TBB_ALIAS_ATTR_COPY(pvalloc); +TBBMALLOCPROXY_EXPORT void *__libc_valloc(size_t size) __TBB_ALIAS_ATTR_COPY(valloc); // call original __libc_* to support naive replacement of free via __libc_free etc -void __libc_free(void *ptr) +TBBMALLOCPROXY_EXPORT void __libc_free(void *ptr) { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_libc_free); } -void *__libc_realloc(void *ptr, size_t size) +TBBMALLOCPROXY_EXPORT void *__libc_realloc(void *ptr, size_t size) { InitOrigPointers(); return __TBB_malloc_safer_realloc(ptr, size, orig_libc_realloc); @@ -308,31 +308,31 @@ void *__libc_realloc(void *ptr, size_t size) /*** replacements for global operators new and delete ***/ -void* operator new(size_t sz) { +TBBMALLOCPROXY_EXPORT void* operator new(size_t sz) { return InternalOperatorNew(sz); } -void* operator new[](size_t sz) { +TBBMALLOCPROXY_EXPORT void* operator new[](size_t sz) { return InternalOperatorNew(sz); } -void operator delete(void* ptr) noexcept { +TBBMALLOCPROXY_EXPORT void operator delete(void* ptr) noexcept { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } -void operator delete[](void* ptr) noexcept { +TBBMALLOCPROXY_EXPORT void operator delete[](void* ptr) noexcept { InitOrigPointers(); 
__TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } -void* operator new(size_t sz, const std::nothrow_t&) noexcept { +TBBMALLOCPROXY_EXPORT void* operator new(size_t sz, const std::nothrow_t&) noexcept { return scalable_malloc(sz); } -void* operator new[](std::size_t sz, const std::nothrow_t&) noexcept { +TBBMALLOCPROXY_EXPORT void* operator new[](std::size_t sz, const std::nothrow_t&) noexcept { return scalable_malloc(sz); } -void operator delete(void* ptr, const std::nothrow_t&) noexcept { +TBBMALLOCPROXY_EXPORT void operator delete(void* ptr, const std::nothrow_t&) noexcept { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } -void operator delete[](void* ptr, const std::nothrow_t&) noexcept { +TBBMALLOCPROXY_EXPORT void operator delete[](void* ptr, const std::nothrow_t&) noexcept { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } @@ -431,14 +431,12 @@ void __TBB_malloc__free_base(void *ptr) const char* known_bytecodes[] = { #if _WIN64 // "========================================================" - 56 symbols + "E9********CCCC", // multiple - jmp(0xE9) with address followed by empty space (0xCC - INT 3) "4883EC284885C974", // release free() "4883EC284885C975", // release _msize() "4885C974375348", // release free() 8.0.50727.42, 10.0 - "E907000000CCCC", // release _aligned_msize(), _aligned_free() ucrtbase.dll "C7442410000000008B", // release free() ucrtbase.dll 10.0.14393.33 - "E90B000000CCCC", // release _msize() ucrtbase.dll 10.0.14393.33 "48895C24085748", // release _aligned_msize() ucrtbase.dll 10.0.14393.33 - "E903000000CCCC", // release _aligned_msize() ucrtbase.dll 10.0.16299.522 "48894C24084883EC28BA", // debug prologue "4C894424184889542410", // debug _aligned_msize() 10.0 "48894C24084883EC2848", // debug _aligned_free 10.0 diff --git a/src/tbbmalloc_proxy/proxy.h b/src/tbbmalloc_proxy/proxy.h index 5f0133f9e0..d45def3e5f 100644 --- a/src/tbbmalloc_proxy/proxy.h +++ 
b/src/tbbmalloc_proxy/proxy.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ extern "C" { TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize( void *ptr, size_t, size_t, size_t (*orig_msize_crt80d)(void*,size_t,size_t)); #if MALLOC_ZONE_OVERLOAD_ENABLED - void __TBB_malloc_free_definite_size(void *object, size_t size); + TBBMALLOC_EXPORT void __TBB_malloc_free_definite_size(void *object, size_t size); #endif } // extern "C" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fb4a78bdbb..0ab4d7e8c8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -537,7 +537,7 @@ if (TARGET TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_range DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_range2d DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_blocked_range3d DEPENDENCIES TBB::tbb) - tbb_add_test(SUBDIR conformance NAME conformance_blocked_rangeNd DEPENDENCIES TBB::tbb) + tbb_add_test(SUBDIR conformance NAME conformance_blocked_nd_range DEPENDENCIES TBB::tbb) tbb_add_test(SUBDIR conformance NAME conformance_concurrent_vector DEPENDENCIES TBB::tbb) if (NOT TBB_TCM_TESTING) tbb_add_test(SUBDIR conformance NAME conformance_global_control DEPENDENCIES TBB::tbb) diff --git a/test/common/common_arena_constraints.h b/test/common/common_arena_constraints.h index 4f2da92022..22c0d05309 100644 --- a/test/common/common_arena_constraints.h +++ b/test/common/common_arena_constraints.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2019-2023 Intel Corporation + Copyright (c) 2019-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -560,16 +560,19 @@ constraints_container generate_constraints_variety() { #endif /*__HYBRID_CPUS_TESTING*/ } + int max_threads_per_core = system_info::get_available_max_threads_values().back(); // Some constraints may cause unexpected behavior, which would be fixed later. if (get_processors_group_count() > 1) { - for(auto it = results.begin(); it != results.end(); ++it) { - if (it->max_threads_per_core != tbb::task_arena::automatic + for(auto it = results.begin(); it != results.end();) { + if (it->max_threads_per_core != max_threads_per_core && (it->numa_id == tbb::task_arena::automatic || tbb::info::numa_nodes().size() == 1) #if __HYBRID_CPUS_TESTING && (it->core_type == tbb::task_arena::automatic || tbb::info::core_types().size() == 1) #endif /*__HYBRID_CPUS_TESTING*/ ) { it = results.erase(it); + } else { + ++it; } } } diff --git a/test/common/config.h b/test/common/config.h index aa75790c3f..c7ff8ba63a 100644 --- a/test/common/config.h +++ b/test/common/config.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,9 +36,6 @@ #ifndef TBB_PREVIEW_VARIADIC_PARALLEL_INVOKE #define TBB_PREVIEW_VARIADIC_PARALLEL_INVOKE 1 #endif -#ifndef TBB_PREVIEW_BLOCKED_RANGE_ND -#define TBB_PREVIEW_BLOCKED_RANGE_ND 1 -#endif #ifndef TBB_PREVIEW_ISOLATED_TASK_GROUP #define TBB_PREVIEW_ISOLATED_TASK_GROUP 1 #endif diff --git a/test/common/graph_utils.h b/test/common/graph_utils.h index 24814d5fd3..2c2099f6df 100644 --- a/test/common/graph_utils.h +++ b/test/common/graph_utils.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -35,7 +35,7 @@ #include "common/spin_barrier.h" -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; // Needed conversion to and from continue_msg, but didn't want to add // conversion operators to the class, since we don't want it in general, @@ -277,11 +277,17 @@ struct harness_counting_receiver : public tbb::flow::receiver { return my_graph; } - tbb::detail::d1::graph_task *try_put_task( const T & ) override { + tbb::detail::d2::graph_task *try_put_task( const T & ) override { ++my_count; - return const_cast(SUCCESSFULLY_ENQUEUED); + return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::detail::d2::graph_task *try_put_task( const T &t, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(t); + } +#endif + void validate() { size_t n = my_count; CHECK( n == num_copies*max_value ); @@ -323,14 +329,20 @@ struct harness_mapped_receiver : public tbb::flow::receiver { my_multiset = new multiset_type; } - tbb::detail::d1::graph_task* try_put_task( const T &t ) override { + tbb::detail::d2::graph_task* try_put_task( const T &t ) override { if ( my_multiset ) { (*my_multiset).emplace( t ); } else { ++my_count; } - return const_cast(SUCCESSFULLY_ENQUEUED); + return const_cast(SUCCESSFULLY_ENQUEUED); + } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::detail::d2::graph_task *try_put_task( const T &t, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(t); } +#endif tbb::flow::graph& graph_reference() const override { return my_graph; @@ -404,6 +416,12 @@ struct harness_counting_sender : public tbb::flow::sender { } } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + bool try_get( T & v, tbb::detail::d2::message_metainfo& ) override { + return try_get(v); + } +#endif + bool try_put_once() { successor_type *s = my_receiver; size_t i = my_count++; @@ -842,7 +860,7 @@ struct throwing_body{ if(my_counter == Threshold) throw Threshold; } - 
+ template output_tuple_type operator()(const input_type&) { ++my_counter; diff --git a/test/common/utils_dynamic_libs.h b/test/common/utils_dynamic_libs.h index 5e5365fc8f..99afca3840 100644 --- a/test/common/utils_dynamic_libs.h +++ b/test/common/utils_dynamic_libs.h @@ -58,7 +58,7 @@ namespace utils { #define EXT ".dylib" #endif // Android SDK build system does not support .so file name versioning -#elif __FreeBSD__ || __NetBSD__ || __sun || _AIX || __ANDROID__ +#elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define EXT ".so" #elif __unix__ // Order of these elif's matters! #define EXT __TBB_STRING(.so.2) diff --git a/test/conformance/conformance_blocked_rangeNd.cpp b/test/conformance/conformance_blocked_nd_range.cpp similarity index 79% rename from test/conformance/conformance_blocked_rangeNd.cpp rename to test/conformance/conformance_blocked_nd_range.cpp index 52faac52ca..999eacb3c6 100644 --- a/test/conformance/conformance_blocked_rangeNd.cpp +++ b/test/conformance/conformance_blocked_nd_range.cpp @@ -19,11 +19,10 @@ #include "common/utils_assert.h" #include "common/utils_concurrency_limit.h" -//! \file conformance_blocked_rangeNd.cpp -//! \brief Test for [preview] functionality +//! \file conformance_blocked_nd_range.cpp +//! 
\brief Test for [algorithms.blocked_nd_range] specification -#define TBB_PREVIEW_BLOCKED_RANGE_ND 1 -#include "oneapi/tbb/blocked_rangeNd.h" +#include "oneapi/tbb/blocked_nd_range.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/global_control.h" @@ -160,10 +159,10 @@ int MakeInt(int i) { return i; } template void SerialTest() { - static_assert((oneapi::tbb::blocked_rangeNd::ndims() == oneapi::tbb::blocked_rangeNd::ndims()), + static_assert((oneapi::tbb::blocked_nd_range::dim_count() == oneapi::tbb::blocked_nd_range::dim_count()), "different amount of dimensions"); - using range_t = oneapi::tbb::blocked_rangeNd; + using range_t = oneapi::tbb::blocked_nd_range; using utils_t = range_utils; // Generate empty range @@ -171,7 +170,7 @@ void SerialTest() { utils::AssertSameType(r.is_divisible(), bool()); utils::AssertSameType(r.empty(), bool()); - utils::AssertSameType(range_t::ndims(), 0U); + utils::AssertSameType(range_t::dim_count(), 0U); REQUIRE((r.empty() == utils_t::is_empty(r) && r.empty())); REQUIRE(r.is_divisible() == utils_t::is_divisible(r)); @@ -190,7 +189,7 @@ template<> void SerialTest<0>() {} template void ParallelTest() { - using range_t = oneapi::tbb::blocked_rangeNd; + using range_t = oneapi::tbb::blocked_nd_range; using utils_t = range_utils; // Max size is 1 << 20 - 1 bytes @@ -210,28 +209,45 @@ void ParallelTest() { } template<> void ParallelTest<0>() {} -//! Testing blocked_rangeNd construction +//! Testing blocked_nd_range construction //! 
\brief \ref interface TEST_CASE("Construction") { - oneapi::tbb::blocked_rangeNd{ { 0,13,3 } }; + oneapi::tbb::blocked_nd_range{ { 0,13,3 } }; - oneapi::tbb::blocked_rangeNd{ oneapi::tbb::blocked_range{ 0,13,3 } }; + oneapi::tbb::blocked_nd_range{ oneapi::tbb::blocked_range{ 0,13,3 } }; - oneapi::tbb::blocked_rangeNd(oneapi::tbb::blocked_range(-8923, 8884, 13), oneapi::tbb::blocked_range(-8923, 5, 13)); + oneapi::tbb::blocked_nd_range(oneapi::tbb::blocked_range(-8923, 8884, 13), oneapi::tbb::blocked_range(-8923, 5, 13)); - oneapi::tbb::blocked_rangeNd({ -8923, 8884, 13 }, { -8923, 8884, 13 }); + oneapi::tbb::blocked_nd_range({ -8923, 8884, 13 }, { -8923, 8884, 13 }); oneapi::tbb::blocked_range r1(0, 13); oneapi::tbb::blocked_range r2(-12, 23); - oneapi::tbb::blocked_rangeNd({ { -8923, 8884, 13 }, r1}); + oneapi::tbb::blocked_nd_range({ { -8923, 8884, 13 }, r1}); - oneapi::tbb::blocked_rangeNd({ r2, r1 }); + oneapi::tbb::blocked_nd_range({ r2, r1 }); - oneapi::tbb::blocked_rangeNd(r1, r2); + oneapi::tbb::blocked_nd_range(r1, r2); - oneapi::tbb::blocked_rangeNd({ MakeAbstractValue(-3), MakeAbstractValue(13), 8 }, + int sizes[] = {174, 39, 2481, 93}; + oneapi::tbb::blocked_nd_range rNd_1(sizes, /*grainsize*/7); + + oneapi::tbb::blocked_nd_range rNd_2({174, 39, 2481, 93}, /*grainsize*/11); + + for (unsigned i = 0; i < rNd_1.dim_count(); ++i) { + oneapi::tbb::blocked_nd_range::dim_range_type dim1 = rNd_1.dim(i); + oneapi::tbb::blocked_nd_range::dim_range_type dim2 = rNd_2.dim(i); + REQUIRE(dim1.begin()==0); + REQUIRE(dim2.begin()==0); + unsigned int szi = sizes[i]; // to compare with unsigned integrals without warnings + REQUIRE(dim1.size()==szi); + REQUIRE(dim2.size()==szi); + REQUIRE(dim1.grainsize()==7); + REQUIRE(dim2.grainsize()==11); + } + + oneapi::tbb::blocked_nd_range({ MakeAbstractValue(-3), MakeAbstractValue(13), 8 }, { MakeAbstractValue(-53), MakeAbstractValue(23), 2 }, { MakeAbstractValue(-23), MakeAbstractValue(33), 1 }, { MakeAbstractValue(-13), 
MakeAbstractValue(43), 7 }); @@ -239,14 +255,14 @@ TEST_CASE("Construction") { static const std::size_t N = 4; -//! Testing blocked_rangeNd interface +//! Testing blocked_nd_range interface //! \brief \ref interface \ref requirement TEST_CASE("Serial test") { SerialTest(); } #if !EMSCRIPTEN -//! Testing blocked_rangeNd interface with parallel_for +//! Testing blocked_nd_range interface with parallel_for //! \brief \ref requirement TEST_CASE("Parallel test") { for ( auto concurrency_level : utils::concurrency_range() ) { @@ -256,13 +272,13 @@ TEST_CASE("Parallel test") { } #endif -//! Testing blocked_rangeNd with proportional splitting +//! Testing blocked_nd_range with proportional splitting //! \brief \ref interface \ref requirement -TEST_CASE("blocked_rangeNd proportional splitting") { - oneapi::tbb::blocked_rangeNd original{{0, 100}, {0, 100}}; - oneapi::tbb::blocked_rangeNd first(original); +TEST_CASE("blocked_nd_range proportional splitting") { + oneapi::tbb::blocked_nd_range original{{0, 100}, {0, 100}}; + oneapi::tbb::blocked_nd_range first(original); oneapi::tbb::proportional_split ps(3, 1); - oneapi::tbb::blocked_rangeNd second(first, ps); + oneapi::tbb::blocked_nd_range second(first, ps); int expected_first_end = static_cast( original.dim(0).begin() + ps.left() * (original.dim(0).end() - original.dim(0).begin()) / (ps.left() + ps.right()) diff --git a/test/tbb/test_blocked_range.cpp b/test/tbb/test_blocked_range.cpp index 651122220a..2443d590e1 100644 --- a/test/tbb/test_blocked_range.cpp +++ b/test/tbb/test_blocked_range.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -24,11 +24,10 @@ #include "tbb/blocked_range.h" #include "tbb/blocked_range2d.h" #include "tbb/blocked_range3d.h" -#define TBB_PREVIEW_BLOCKED_RANGE_ND 1 -#include "tbb/blocked_rangeNd.h" +#include "tbb/blocked_nd_range.h" //! \file test_blocked_range.cpp -//! \brief Test for [algorithms.blocked_range] specification +//! \brief Test for [algorithms.blocked_range algorithms.blocked_range2d algorithms.blocked_range3d algorithms.blocked_nd_range] specification #include //for std::pair #include @@ -120,12 +119,12 @@ template void test_blocked_range3d_col_invalid_constraint() {} template -concept well_formed_blocked_range_Nd_instantiation_basic = requires { - typename tbb::blocked_rangeNd; +concept well_formed_blocked_nd_range_instantiation_basic = requires { + typename tbb::blocked_nd_range; }; template -concept well_formed_blocked_range_Nd_instantiation = ( ... && well_formed_blocked_range_Nd_instantiation_basic ); +concept well_formed_blocked_nd_range_instantiation = ( ... && well_formed_blocked_nd_range_instantiation_basic ); //! \brief \ref error_guessing TEST_CASE("constraints for blocked_range value") { @@ -180,13 +179,13 @@ TEST_CASE("constraints for blocked_range3d value") { } //! 
\brief \ref error_guessing -TEST_CASE("constraints for blocked_rangeNd value") { +TEST_CASE("constraints for blocked_nd_range value") { using namespace test_concepts::blocked_range_value; using const_iterator = typename std::vector::const_iterator; - static_assert(well_formed_blocked_range_Nd_instantiation); + static_assert(well_formed_blocked_nd_range_instantiation); - static_assert(!well_formed_blocked_range_Nd_instantiation); diff --git a/test/tbb/test_broadcast_node.cpp b/test/tbb/test_broadcast_node.cpp index b3905e6d60..662a08331d 100644 --- a/test/tbb/test_broadcast_node.cpp +++ b/test/tbb/test_broadcast_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ //! \brief Test for [flow_graph.broadcast_node] specification -#define TBB_INTERNAL_NAMESPACE detail::d1 +#define TBB_INTERNAL_NAMESPACE detail::d2 namespace tbb { using task = TBB_INTERNAL_NAMESPACE::graph_task; } @@ -73,6 +73,12 @@ class counting_array_receiver : public tbb::flow::receiver { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + tbb::task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -241,6 +247,166 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +// Basic idea of the following tests is to check that try_put_and_wait(message) call for broadcast_node +// processes all of the previous jobs required to process message, the message itself, but does +// not process the elements submitted later or not required to process the message +// These tests submit start_work_items using the regular try_put and then submit wait_message +// with try_put_and_wait. 
During the completion of the graph, new_work_items would be submitted +// once the wait_message arrives. +void test_try_put_and_wait_spawning_and_serial_receiver() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items_unlimited, processed_items_serial; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::broadcast_node broadcast(g); + + // Broadcast to 2 function_nodes, one with unlimited concurrency and the other serial + tbb::flow::function_node unlimited(g, tbb::flow::unlimited, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + broadcast.try_put(item); + } + } + processed_items_unlimited.emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, unlimited); + + tbb::flow::function_node serial(g, tbb::flow::serial, + [&](int input) noexcept { + processed_items_serial.emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, serial); + + for (int i = 0; i < wait_message; ++i) { + broadcast.try_put(i); + } + + broadcast.try_put_and_wait(wait_message); + + size_t unlimited_check_index = 0, serial_check_index = 0; + + // For the unlimited function_node, all of the tasks for start_work_items and wait_message would be spawned + // and hence processed by the thread in LIFO order. 
+ // The first processed item is expected to be wait_message since it was spawned last + CHECK_MESSAGE(processed_items_unlimited.size() == new_work_items.size() + start_work_items.size(), + "Unexpected number of processed items"); + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == wait_message, "Unexpected items processing"); + for (int i = int(new_work_items.size()) - 1; i >= 0; --i) { + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == new_work_items[i], "Unexpected items processing"); + } + for (int i = int(start_work_items.size()) - 1; i >= 1; --i) { + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == start_work_items[i], "Unexpected items processing"); + } + + // Serial queueing function_node should add all start_work_items except the first one into the queue + // and then process them in FIFO order. + // wait_message would also be added to the queue, but would be processed later + CHECK_MESSAGE(processed_items_serial.size() == start_work_items.size() + 1, + "Unexpected number of processed items"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items_serial[serial_check_index++] == item, "Unexpected items processing"); + } + CHECK_MESSAGE(processed_items_serial[serial_check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items_unlimited[unlimited_check_index++] == start_work_items[0], "Unexpected items processing"); + + // For serial queueing function_node, the new_work_items are expected to be processed while calling to wait_for_all + // They would be queued and processed later in FIFO order + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items_serial[serial_check_index++] == item, "Unexpected items processing"); + } + CHECK(serial_check_index == processed_items_serial.size()); + CHECK(unlimited_check_index == processed_items_unlimited.size()); + }); +} + +void test_try_put_and_wait_spawning_receivers() { + 
tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + int wait_message = 10; + int num_successors = wait_message - 1; + + std::vector start_work_items; + std::vector> processed_items(num_successors); + std::vector new_work_items; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::broadcast_node broadcast(g); + + std::vector> successors; + successors.reserve(num_successors); + for (int i = 0; i < num_successors; ++i) { + successors.emplace_back(g, tbb::flow::unlimited, + [&, i](int input) noexcept { + if (input == wait_message) { + broadcast.try_put(new_work_items[i]); + } + processed_items[i].emplace_back(input); + return 0; + }); + tbb::flow::make_edge(broadcast, successors.back()); + } + + for (int i = 0; i < wait_message; ++i) { + broadcast.try_put(i); + } + + broadcast.try_put_and_wait(wait_message); + + for (int i = num_successors - 1; i >= 0; --i) { + size_t check_index = 0; + for (int j = num_successors - 1; j != i; --j) { + CHECK_MESSAGE(processed_items[i][check_index++] == new_work_items[j], "Unexpected items processing"); + } + CHECK_MESSAGE(processed_items[i][check_index++] == wait_message, "Unexpected items processing"); + for (int j = i; j >= 1; --j) { + CHECK_MESSAGE(processed_items[i][check_index++] == new_work_items[j], "Unexpected items processing"); + } + } + + g.wait_for_all(); + + for (auto& processed_item : processed_items) { + size_t check_index = num_successors; + CHECK_MESSAGE(processed_item[check_index++] == new_work_items[0], "Unexpected items processing"); + for (int i = int(start_work_items.size()) - 1; i >= 0; --i) { + CHECK_MESSAGE(processed_item[check_index++] == start_work_items[i], "Unexpected items processing"); + } + } + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_spawning_and_serial_receiver(); + test_try_put_and_wait_spawning_receivers(); +} +#endif // 
__TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test serial broadcasts //! \brief \ref error_guessing TEST_CASE("Serial broadcasts"){ @@ -282,3 +448,9 @@ TEST_CASE("Deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test broadcast_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_buffer_node.cpp b/test/tbb/test_buffer_node.cpp index 89f4485b3d..527005aecb 100644 --- a/test/tbb/test_buffer_node.cpp +++ b/test/tbb/test_buffer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,11 +24,11 @@ #include "common/graph_utils.h" #include "common/test_follows_and_precedes_api.h" +#include "test_buffering_try_put_and_wait.h" //! \file test_buffer_node.cpp //! \brief Test for [flow_graph.buffer_node] specification - #define N 1000 #define C 10 @@ -307,7 +307,7 @@ int test_parallel(int num_threads) { // Chained buffers ( 2 & 3 ), single sender, items at last buffer in arbitrary order // -#define TBB_INTERNAL_NAMESPACE detail::d1 +#define TBB_INTERNAL_NAMESPACE detail::d2 using tbb::TBB_INTERNAL_NAMESPACE::register_predecessor; using tbb::TBB_INTERNAL_NAMESPACE::remove_predecessor; @@ -455,6 +455,161 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_buffer_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once 
wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in LIFO order + // 4. wait_message would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process wait_message and add new_work_items to the buffer1 + // 6. forward_task for new_work_items would be spawned, wait_message would be buffered in the buffer2 + // 7. function task for next FIFO item in the queue would be spawned + // 8. forward_task for wait_message in buffer2 would be executed without spawning + // 9. writer task for wait_message would be executed without spawning and write wait_message to the buffer + // 10. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in LIFO order + // 11. 
function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { wait_message, start_work_items LIFO, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == 1, "try_put_and_wait should process only the wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "try_put_and_wait should process only the wait_message"); + + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "wait_for_all should process start_work_items LIFO"); + } + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "wait_for_all should process new_work_items LIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. 
occupier would be processed + // 5. items would be taken from the buffer by function in LIFO order + // 6. wait_message would be taken first and push new_work_items to the buffer + // Expected items processing { occupier, wait_message, new_work_items LIFO, start_work_items LIFO } + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == 2, "Only wait_message and occupier should be processed by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "wait_for_all should process new_work_items LIFO"); + } + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "wait_for_all should process start_work_items LIFO"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push wait_message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for wait_message processing + // 5. 
wait_message would be processed that would add new_work_items to the buffer + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // Since the reservation always accepts the front element of the buffer + // it is expected that the items would be taken from the buffer in FIFO order + // instead of LIFO on try_get for buffer_node + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == 1, "try_put_and_wait should process only wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + #include //! Test buffer_node with parallel and serial neighbours @@ -489,8 +644,15 @@ TEST_CASE("Follows and precedes API"){ #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT //! Test deduction guides -//! \brief requirement +//! \brief \ref requirement TEST_CASE("Deduction guides"){ test_deduction_guides(); } #endif + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test buffer_node try_put_and_wait") { + test_buffer_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_buffering_try_put_and_wait.h b/test/tbb/test_buffering_try_put_and_wait.h new file mode 100644 index 0000000000..300521233f --- /dev/null +++ b/test/tbb/test_buffering_try_put_and_wait.h @@ -0,0 +1,189 @@ +/* + Copyright (c) 2024 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __TBB_test_tbb_buffering_try_put_and_wait_H +#define __TBB_test_tbb_buffering_try_put_and_wait_H + +#include +#include + +#include + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + +namespace test_try_put_and_wait { + +template +std::size_t test_buffer_push(const std::vector& start_work_items, + int wait_message, + const std::vector& new_work_items, + std::vector& processed_items, + Args... args) +{ + std::size_t after_try_put_and_wait_start_index = 0; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + using function_node_type = tbb::flow::function_node; + + BufferingNode buffer1(g, args...); + + function_node_type function(g, tbb::flow::serial, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer1.try_put(item); + } + } + return input; + }); + + BufferingNode buffer2(g, args...); + + function_node_type writer(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer1, function); + tbb::flow::make_edge(function, buffer2); + tbb::flow::make_edge(buffer2, writer); + + for (auto item : start_work_items) { + buffer1.try_put(item); + } + + buffer1.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +template +std::size_t test_buffer_pull(const std::vector& start_work_items, + int wait_message, + int occupier, + const std::vector& new_work_items, + std::vector& 
processed_items, + Args... args) +{ + tbb::task_arena arena(1); + std::size_t after_try_put_and_wait_start_index = 0; + + arena.execute([&] { + tbb::flow::graph g; + + using function_node_type = tbb::flow::function_node; + + BufferingNode buffer(g, args...); + + function_node_type function(g, tbb::flow::serial, + [&](int input) noexcept { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer.try_put(item); + } + } + + processed_items.emplace_back(input); + return 0; + }); + + // Occupy the concurrency of function_node + // This call spawns the task to process the occupier + function.try_put(occupier); + + // Make edge between buffer and function after occupying the concurrency + // To ensure that forward task of the buffer would be spawned after the occupier task + // And the function_node would reject the items from the buffer + // and process them later by calling try_get on the buffer + tbb::flow::make_edge(buffer, function); + + for (auto item : start_work_items) { + buffer.try_put(item); + } + + buffer.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +template +std::size_t test_buffer_reserve(std::size_t limiter_threshold, + const std::vector& start_work_items, + int wait_message, + const std::vector& new_work_items, + std::vector& processed_items, + Args... 
args) +{ + tbb::task_arena arena(1); + std::size_t after_try_put_and_wait_start_index = 0; + + arena.execute([&] { + tbb::flow::graph g; + + BufferingNode buffer(g, args...); + + tbb::flow::limiter_node limiter(g, limiter_threshold); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + for (auto item : new_work_items) { + buffer.try_put(item); + } + } + // Explicitly put to the decrementer instead of making edge + // to guarantee that the next task would be spawned and not returned + // to the current thread as the next task + // Otherwise, all elements would be processed during the try_put_and_wait + limiter.decrementer().try_put(1); + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer, limiter); + tbb::flow::make_edge(limiter, function); + + for (auto item : start_work_items) { + buffer.try_put(item); + } + + buffer.try_put_and_wait(wait_message); + + after_try_put_and_wait_start_index = processed_items.size(); + + g.wait_for_all(); + }); + + return after_try_put_and_wait_start_index; +} + +} // test_try_put_and_wait + +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +#endif // __TBB_test_tbb_buffering_try_put_and_wait_H diff --git a/test/tbb/test_concurrent_vector.cpp b/test/tbb/test_concurrent_vector.cpp index afb37ab49c..0237102e17 100644 --- a/test/tbb/test_concurrent_vector.cpp +++ b/test/tbb/test_concurrent_vector.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include #include +#include //! \file test_concurrent_vector.cpp //! \brief Test for [containers.concurrent_vector] specification @@ -692,14 +693,36 @@ TEST_CASE("swap with not always equal allocators"){ // or fail with the assertion in debug mode. //! 
\brief \ref regression TEST_CASE("Testing vector in a highly concurrent environment") { - for (std::size_t i = 0; i < 10000; ++i) { - tbb::concurrent_vector test_vec; - - tbb::parallel_for(tbb::blocked_range(0, 10000), [&] (const tbb::blocked_range&) { - test_vec.grow_by(1); - }, tbb::static_partitioner{}); + std::uniform_int_distribution<> uniform_dist(1, 32); // grow by from 1 to 32 randomly + std::mt19937_64 gen(/*seed*/1); // Constructing with seed to have reproducible results + constexpr int num_repeats = 10000, num_inserts = 256; + std::vector grow_by_vals(num_inserts); + + for (int i = 0; i < num_repeats; ++i) { + int expected_size = 0, expected_sum = 0; + std::generate(grow_by_vals.begin(), grow_by_vals.end(), + [&gen, &uniform_dist, &expected_size, &expected_sum]() { + const int random_value = uniform_dist(gen); + expected_size += random_value; + expected_sum += random_value * random_value; + return random_value; + }); - REQUIRE(test_vec.size() == utils::get_platform_max_threads()); + tbb::concurrent_vector test_vec; + tbb::parallel_for(0, num_inserts, [&] (int j) { + tbb::concurrent_vector::iterator start_it = test_vec.grow_by(grow_by_vals[j]); + tbb::concurrent_vector::iterator end_it = start_it + grow_by_vals[j]; + do { + *start_it = grow_by_vals[j]; + } while (++start_it != end_it); + }); + + REQUIRE(test_vec.size() == expected_size); + int actual_sum = 0; + for (int j = 0; j < expected_size; ++j) { + actual_sum += test_vec[j]; + } + REQUIRE(expected_sum == actual_sum); } } diff --git a/test/tbb/test_continue_node.cpp b/test/tbb/test_continue_node.cpp index 8c2c5c5bb9..1cfea3df43 100644 --- a/test/tbb/test_continue_node.cpp +++ b/test/tbb/test_continue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -63,7 +63,7 @@ template< typename OutputType > void run_continue_nodes( int p, tbb::flow::graph& g, tbb::flow::continue_node< OutputType >& n ) { fake_continue_sender fake_sender; for (size_t i = 0; i < N; ++i) { - tbb::detail::d1::register_predecessor(n, fake_sender); + tbb::detail::d2::register_predecessor(n, fake_sender); } for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { @@ -138,7 +138,7 @@ void continue_nodes_with_copy( ) { tbb::flow::continue_node< OutputType > exe_node( g, cf ); fake_continue_sender fake_sender; for (size_t i = 0; i < N; ++i) { - tbb::detail::d1::register_predecessor(exe_node, fake_sender); + tbb::detail::d2::register_predecessor(exe_node, fake_sender); } for (size_t num_receivers = 1; num_receivers <= MAX_NODES; ++num_receivers ) { @@ -354,6 +354,176 @@ void test_successor_cache_specialization() { "Wrong number of messages is passed via continue_node"); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait_default() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + int processed_items = 0; + + tbb::flow::continue_node* start_node = nullptr; + + tbb::flow::continue_node cont(g, + [&](tbb::flow::continue_msg) noexcept { + static bool put_ten_msgs = true; + if (put_ten_msgs) { + for (std::size_t i = 0; i < 10; ++i) { + start_node->try_put(tbb::flow::continue_msg{}); + } + put_ten_msgs = false; + } + }); + + start_node = &cont; + + tbb::flow::continue_node writer(g, + [&](tbb::flow::continue_msg) noexcept { + ++processed_items; + }); + + tbb::flow::make_edge(cont, writer); + + cont.try_put_and_wait(tbb::flow::continue_msg{}); + + // Only 1 item should be processed, with the additional 10 items having been spawned + CHECK_MESSAGE(processed_items == 1, "Unexpected items processing"); + + g.wait_for_all(); + + // The additional 10 items should be processed + CHECK_MESSAGE(processed_items == 11, "Unexpected items processing"); + }); +} + +void 
test_try_put_and_wait_lightweight() { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::continue_node* start_node = nullptr; + + tbb::flow::continue_node cont(g, + [&](tbb::flow::continue_msg) noexcept { + static int counter = 0; + int i = counter++; + if (i == wait_message) { + for (auto item : new_work_items) { + (void)item; + start_node->try_put(tbb::flow::continue_msg{}); + } + } + return i; + }); + + start_node = &cont; + + tbb::flow::function_node writer(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(cont, writer); + + for (auto item : start_work_items) { + (void)item; + cont.try_put(tbb::flow::continue_msg{}); + } + + cont.try_put_and_wait(tbb::flow::continue_msg{}); + + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + new_work_items.size() + 1, + "Unexpected number of elements processed"); + + std::size_t check_index = 0; + + // For lightweight continue_node, start_work_items are expected to be processed first + // while putting items into the first node + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + // wait_message would be processed only after new_work_items + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK(check_index == processed_items.size()); + }); +} + +void test_metainfo_buffering() { + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + 
std::vector call_order; + + tbb::flow::continue_node* b_ptr = nullptr; + + tbb::flow::continue_node a(g, + [&](tbb::flow::continue_msg) noexcept { + call_order.push_back('A'); + static std::once_flag flag; // Send a signal to B only in the first call + std::call_once(flag, [&]{ b_ptr->try_put(tbb::flow::continue_msg{}); }); + }); + + tbb::flow::continue_node b(g, + [&](tbb::flow::continue_msg) noexcept { + call_order.push_back('B'); + a.try_put(tbb::flow::continue_msg{}); + }); + + b_ptr = &b; + + tbb::flow::continue_node c(g, + [&](tbb::flow::continue_msg) noexcept { + call_order.push_back('C'); + }); + + tbb::flow::make_edge(a, c); + tbb::flow::make_edge(b, c); + + a.try_put_and_wait(tbb::flow::continue_msg{}); + + // Inside the first call of A, we send a signal to B. + // Both of them send signals to C. Since C lightweight, it is processed immediately + // upon receiving signals from both predecessors. This completes the wait. + CHECK(call_order == std::vector{'A', 'B', 'C'}); + + g.wait_for_all(); + + // B previously sent a signal to A, which has now been processed. + // A sends a signal to C, which is not processed because no signal is received from B this time. + CHECK(call_order == std::vector{'A', 'B', 'C', 'A'}); + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_default(); + test_try_put_and_wait_lightweight(); + test_metainfo_buffering(); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test concurrent continue_node for correctness //! \brief \ref error_guessing TEST_CASE("Concurrency testing") { @@ -418,3 +588,10 @@ TEST_CASE("constraints for continue_node body") { static_assert(!can_call_continue_node_ctor>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test continue_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_dynamic_link.cpp b/test/tbb/test_dynamic_link.cpp index 2856db37c0..3372eb7239 100644 --- a/test/tbb/test_dynamic_link.cpp +++ b/test/tbb/test_dynamic_link.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ enum FOO_TYPE { #if _WIN32 || _WIN64 #define TEST_EXPORT #else -#define TEST_EXPORT extern "C" +#define TEST_EXPORT extern "C" __TBB_EXPORT #endif /* _WIN32 || _WIN64 */ // foo "implementations". diff --git a/test/tbb/test_flow_graph_whitebox.cpp b/test/tbb/test_flow_graph_whitebox.cpp index a3ed03b252..88365d892d 100644 --- a/test/tbb/test_flow_graph_whitebox.cpp +++ b/test/tbb/test_flow_graph_whitebox.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -459,7 +459,7 @@ template <> struct DecrementerHelper { template static void check(Decrementer& decrementer) { - auto& d = static_cast(decrementer); + auto& d = static_cast(decrementer); CHECK_MESSAGE(d.my_predecessor_count == 0, "error in pred count"); CHECK_MESSAGE(d.my_initial_predecessor_count == 0, "error in initial pred count"); CHECK_MESSAGE(d.my_current_count == 0, "error in current count"); diff --git a/test/tbb/test_function_node.cpp b/test/tbb/test_function_node.cpp index aa7e41ca59..999adac189 100644 --- a/test/tbb/test_function_node.cpp +++ b/test/tbb/test_function_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -469,6 +469,261 @@ void test_follows_and_precedes_api() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +// Basic idea of the following tests is to check that try_put_and_wait(message) call for function_node +// with one of the policies (lightweight, queueing and rejecting) with different concurrency limits +// processes all of the previous jobs required to process message, the message itself, but does +// not process the elements submitted later or not required to process the message +// These tests submit start_work_items using the regular try_put and then submit wait_message +// with try_put_and_wait. During the completion of the graph, new_work_items would be submitted +// once the wait_message arrives. 
+void test_try_put_and_wait_lightweight(std::size_t concurrency_limit) { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + for (int i = 0; i < wait_message; ++i) { + function.try_put(i); + } + + function.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + + // For lightweight function_node, start_work_items are expected to be processed first + // while putting items into the first node. + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + + if (concurrency_limit == tbb::flow::serial) { + // If the lightweight function_node is serial, it should process the wait_message but add items from new_work_items + // into the queue since the concurrency limit is occupied. 
+ CHECK_MESSAGE(processed_items.size() == start_work_items.size() + 1, "Unexpected number of elements processed"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + } else { + // If the node is unlimited, it should process new_work_items immediately while processing the wait_message + // Hence they should be processed before exiting the try_put_and_wait + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + new_work_items.size() + 1, + "Unexpected number of elements processed"); + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + // wait_message would be processed only after new_work_items + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + } + + g.wait_for_all(); + + if (concurrency_limit == tbb::flow::serial) { + // For the serial node, processing of new_work_items would be postponed to wait_for_all since they + // would be queued and spawned after working with wait_message + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } + CHECK(check_index == processed_items.size()); + }); +} + +void test_try_put_and_wait_queueing(std::size_t concurrency_limit) { + tbb::task_arena arena(1); + arena.execute([&]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = 
&function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + for (int i = 0; i < wait_message; ++i) { + function.try_put(i); + } + + function.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + + if (concurrency_limit == tbb::flow::serial) { + // Serial queueing function_node should add all start_work_items except the first one into the queue + // and then process them in FIFO order. + // wait_message would also be added to the queue, but would be processed later + CHECK_MESSAGE(processed_items.size() == start_work_items.size() + 1, "Unexpected number of elements processed"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } else { + CHECK_MESSAGE(processed_items.size() == 1, "Unexpected number of elements processed"); + } + + // For the unlimited function_node, all of the tasks for start_work_items and wait_message would be spawned + // and hence processed by the thread in LIFO order. + // The first processed item is expected to be wait_message since it was spawned last + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + if (concurrency_limit == tbb::flow::serial) { + // For serial queueing function_node, the new_work_items are expected to be processed while calling to wait_for_all + // They would be queued and processed later in FIFO order + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected items processing"); + } + } else { + // Unlimited function_node would always spawn tasks immediately without adding them into the queue + // They would be processed in LIFO order. 
Hence it is expected that new_work_items would be processed first in reverse order + // After them, start_work_items would be processed also in reverse order + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], "Unexpected items processing"); + } + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], "Unexpected items processing"); + } + } + CHECK(check_index == processed_items.size()); + }); +} + +void test_try_put_and_wait_rejecting(size_t concurrency_limit) { + tbb::task_arena arena(1); + + arena.execute([&]{ + tbb::flow::graph g; + + std::vector processed_items; + std::vector new_work_items; + + int wait_message = 0; + + for (int i = 1; i < wait_message; ++i) { + new_work_items.emplace_back(i); + } + + using function_node_type = tbb::flow::function_node; + function_node_type* start_node = nullptr; + + function_node_type function(g, concurrency_limit, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + start_node->try_put(item); + } + } + return input; + }); + + start_node = &function; + + function_node_type writer(g, concurrency_limit, + [&](int input) noexcept { + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(function, writer); + + // If the first action is try_put_and_wait, it will occupy concurrency of the function_node + // All submits of new_work_items inside of the body should be rejected + bool result = function.try_put_and_wait(wait_message); + CHECK_MESSAGE(result, "task should not rejected since the node concurrency is not saturated"); + + CHECK_MESSAGE(processed_items.size() == 1, nullptr); + CHECK_MESSAGE(processed_items[0] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items.size() == 1, nullptr); + + processed_items.clear(); + + // If the first action is 
try_put, try_put_and_wait is expected to return false since the concurrency of the + // node would be saturated + function.try_put(0); + result = function.try_put_and_wait(wait_message); + CHECK_MESSAGE(!result, "task should be rejected since the node concurrency is saturated"); + CHECK(processed_items.empty()); + + g.wait_for_all(); + + CHECK(processed_items.size() == 1); + CHECK_MESSAGE(processed_items[0] == 0, "Unexpected items processing"); + }); +} + +void test_try_put_and_wait() { + test_try_put_and_wait_lightweight(tbb::flow::serial); + test_try_put_and_wait_lightweight(tbb::flow::unlimited); + + test_try_put_and_wait_queueing(tbb::flow::serial); + test_try_put_and_wait_queueing(tbb::flow::unlimited); + + test_try_put_and_wait_rejecting(tbb::flow::serial); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Test various node bodies with concurrency //! \brief \ref error_guessing @@ -544,3 +799,10 @@ TEST_CASE("constraints for function_node body") { static_assert(!can_call_function_node_ctor>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test function_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_hw_concurrency.cpp b/test/tbb/test_hw_concurrency.cpp index 16d4067b83..115a8f34be 100644 --- a/test/tbb/test_hw_concurrency.cpp +++ b/test/tbb/test_hw_concurrency.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -47,8 +47,7 @@ #include "tbb/blocked_range.h" #include "tbb/blocked_range2d.h" #include "tbb/blocked_range3d.h" -#define TBB_PREVIEW_BLOCKED_RANGE_ND 1 -#include "tbb/blocked_rangeNd.h" +#include "tbb/blocked_nd_range.h" // Declaration of global objects are needed to check that // it does not initialize the task scheduler, and in particular @@ -87,7 +86,7 @@ tbb::tick_count test_tc; tbb::blocked_range br(0, 1); tbb::blocked_range2d br2d(0, 1, 0, 1); tbb::blocked_range3d br3d(0, 1, 0, 1, 0, 1); -tbb::blocked_rangeNd brNd({0, 1}, {0, 1}); +tbb::blocked_nd_range brNd({0, 1}, {0, 1}); //! \brief \ref error_guessing TEST_CASE("Check absence of scheduler initialization") { diff --git a/test/tbb/test_indexer_node.cpp b/test/tbb/test_indexer_node.cpp index 4ce87e195a..c47a8cad01 100644 --- a/test/tbb/test_indexer_node.cpp +++ b/test/tbb/test_indexer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -661,6 +661,81 @@ void test_deduction_guides() { #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items1; + std::vector processed_items2; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + tbb::flow::indexer_node indexer(g); + using output_type = decltype(indexer)::output_type; + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](output_type tag_msg) noexcept { + if (tag_msg.tag() == 0) { + int input = tag_msg.cast_to(); + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(indexer).try_put(item); + tbb::flow::input_port<1>(indexer).try_put(float(item)); + } + } + processed_items1.emplace_back(input); + } else { + processed_items2.emplace_back(tag_msg.cast_to()); + } + return 0; + }); + + tbb::flow::make_edge(indexer, function); + + for (auto item : start_work_items) { + tbb::flow::input_port<0>(indexer).try_put(item); + tbb::flow::input_port<1>(indexer).try_put(float(item)); + } + + tbb::flow::input_port<0>(indexer).try_put_and_wait(wait_message); + + // Since function is a serial queueing function node, all start_work_items would be stored in a queue + // wait_message would be stored at the end of the queue + // During the try_put_and_wait call, start_work_items would be processed from the queue in FIFO order + // wait_message would be processed last and adds new_work_items into the same queue + // It is expected then new_work_items would be processed during wait_for_all() call + + std::size_t check_index1 = 0; + std::size_t check_index2 = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items1[check_index1++] == item, "Unexpected items processing"); + 
CHECK_MESSAGE(processed_items2[check_index2++] == float(item), "Unexpected items processing"); + } + + // wait_message was submitted only to the first port of indexer_node + CHECK_MESSAGE(processed_items1[check_index1++] == wait_message, "Unexpected wait_message processing"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items1[check_index1++] == item, "Unexpected new_work_items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == float(item), "Unexpected new_work_items processing"); + } + CHECK((check_index1 == processed_items1.size() && check_index2 == processed_items2.size())); + }); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Serial and parallel test on various tuple sizes //! \brief \ref error_guessing TEST_CASE("Serial and parallel test") { @@ -712,3 +787,9 @@ TEST_CASE("Deduction guides") { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test indexer_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_input_node.cpp b/test/tbb/test_input_node.cpp index f27bf71482..9442693980 100644 --- a/test/tbb/test_input_node.cpp +++ b/test/tbb/test_input_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,8 +30,8 @@ //! 
\brief Test for [flow_graph.input_node] specification -using tbb::detail::d1::graph_task; -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::graph_task; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; const int N = 1000; @@ -61,6 +61,12 @@ class test_push_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task* try_put_task( const T& v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } diff --git a/test/tbb/test_join_node.cpp b/test/tbb/test_join_node.cpp index 2e3af3c547..7f1721e0ee 100644 --- a/test/tbb/test_join_node.cpp +++ b/test/tbb/test_join_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -154,3 +154,40 @@ TEST_CASE("Test removal of the predecessor while having none") { test(connect_join_via_make_edge); } + +//! 
\brief \ref error_guessing +TEST_CASE("Test reservation on the port") { + tbb::flow::graph g; + + tbb::flow::buffer_node buffer1(g), buffer2(g); + tbb::flow::join_node, tbb::flow::reserving> join(g); + tbb::flow::buffer_node> buffer3(g); + + auto& port0 = tbb::flow::input_port<0>(join); + auto& port1 = tbb::flow::input_port<1>(join); + + tbb::flow::make_edge(buffer1, port0); + tbb::flow::make_edge(buffer2, port1); + tbb::flow::make_edge(join, buffer3); + + int value = -42; + bool result = port0.reserve(value); + CHECK_MESSAGE(!result, "Incorrect reserve return value"); + + result = port1.reserve(value); + CHECK_MESSAGE(!result, "Incorrect reserve return value"); + + buffer1.try_put(1); + g.wait_for_all(); + + result = port0.reserve(value); + CHECK_MESSAGE(result, "Incorrect reserve return value"); + CHECK_MESSAGE(value == 1, "Incorrect reserved value"); + port0.release(); + + buffer2.try_put(2); + g.wait_for_all(); + + result = port1.reserve(value); + CHECK_MESSAGE(result, "incorrect reserve return value"); +} diff --git a/test/tbb/test_join_node.h b/test/tbb/test_join_node.h index 8bb12bad51..2216310c1a 100644 --- a/test/tbb/test_join_node.h +++ b/test/tbb/test_join_node.h @@ -245,10 +245,10 @@ struct my_struct_key { } }; -using tbb::detail::d1::type_to_key_function_body; -using tbb::detail::d1::hash_buffer; +using tbb::detail::d2::type_to_key_function_body; +using tbb::detail::d2::type_to_key_function_body_leaf; +using tbb::detail::d2::hash_buffer; using tbb::detail::d1::tbb_hash_compare; -using tbb::detail::d1::type_to_key_function_body_leaf; template struct VtoKFB { typedef type_to_key_function_body type; diff --git a/test/tbb/test_join_node_preview.cpp b/test/tbb/test_join_node_preview.cpp index 4bcb1900d6..3ee4075794 100644 --- a/test/tbb/test_join_node_preview.cpp +++ b/test/tbb/test_join_node_preview.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Intel Corporation + Copyright (c) 2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the 
"License"); you may not use this file except in compliance with the License. @@ -82,6 +82,249 @@ void test_follows_and_precedes_api() { jn_msg_key_matching_follows_and_precedes(); } +void test_try_put_and_wait_queueing() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::join_node join(g); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(join, function); + + for (auto item : start_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + + tbb::flow::input_port<0>(join).try_put(wait_message); + tbb::flow::input_port<1>(join).try_put(wait_message); + tbb::flow::input_port<2>(join).try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. 
Other tuples would be rejected and taken using push-pull protocol + // in FIFO order + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + }); +} + +void test_try_put_and_wait_reserving() { + tbb::task_arena arena(1); + + arena.execute([]{ + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::queue_node buffer1(g); + tbb::flow::queue_node buffer2(g); + tbb::flow::queue_node buffer3(g); + + tbb::flow::join_node join(g); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + buffer1.try_put(item); + buffer2.try_put(item); + buffer3.try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer1, tbb::flow::input_port<0>(join)); + tbb::flow::make_edge(buffer2, tbb::flow::input_port<1>(join)); + tbb::flow::make_edge(buffer3, tbb::flow::input_port<2>(join)); + tbb::flow::make_edge(join, function); + + for (auto item : start_work_items) { + buffer1.try_put(item); + buffer2.try_put(item); + buffer3.try_put(item); + } + + 
buffer1.try_put(wait_message); + buffer2.try_put(wait_message); + buffer3.try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. Other tuples would be rejected and taken using push-pull protocol + // between function and join_node and between join_node and each buffer in FIFO order because queue_node is used + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + }); +} + +struct int_wrapper { + int i = 0; + int_wrapper() : i(0) {} + int_wrapper(int ii) : i(ii) {} + int_wrapper& operator=(int ii) { + i = ii; + return *this; + } + + int key() const { + return i; + } + + friend bool operator==(const int_wrapper& lhs, const int_wrapper& rhs) { + return lhs.i == rhs.i; + } +}; + +template +void test_try_put_and_wait_key_matching(Body... 
body) { + // Body of one argument for testing standard key_matching + // Body of zero arguments for testing message based key_matching + static_assert(sizeof...(Body) == 0 || sizeof...(Body) == 1, "incorrect test setup"); + tbb::task_arena arena(1); + + arena.execute([=] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int_wrapper wait_message = 10; + + for (int i = 0; i < wait_message.i; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message.i); + } + + using tuple_type = std::tuple; + tbb::flow::join_node> join(g, body..., body..., body...); + + tbb::flow::function_node function(g, tbb::flow::serial, + [&](tuple_type tuple) noexcept { + CHECK(std::get<0>(tuple) == std::get<1>(tuple)); + CHECK(std::get<1>(tuple) == std::get<2>(tuple)); + + auto input = std::get<0>(tuple); + + if (input == wait_message) { + for (auto item : new_work_items) { + tbb::flow::input_port<0>(join).try_put(item); + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(join, function); + + tbb::flow::input_port<0>(join).try_put(wait_message); + tbb::flow::input_port<1>(join).try_put(wait_message); + + // For the first port - submit items in reversed order + for (std::size_t i = start_work_items.size(); i != 0; --i) { + tbb::flow::input_port<0>(join).try_put(start_work_items[i - 1]); + } + + // For first two ports - submit items in direct order + for (auto item : start_work_items) { + tbb::flow::input_port<1>(join).try_put(item); + tbb::flow::input_port<2>(join).try_put(item); + } + + tbb::flow::input_port<2>(join).try_put_and_wait(wait_message); + + // It is expected that the join_node would push the tuple of three copies of first element in start_work_items + // And occupy the concurrency of function. 
Other tuples would be rejected and taken using push-pull protocol + // in order of submission + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + }); +} + //! Test follows and precedes API //! \brief \ref error_guessing TEST_CASE("Test follows and precedes API"){ @@ -101,3 +344,13 @@ TEST_CASE("Test removal of the predecessor while having none") { test(connect_join_via_follows); test(connect_join_via_precedes); } + +//! \brief \ref error_guessing +TEST_CASE("Test join_node try_put_and_wait") { + test_try_put_and_wait_queueing(); + test_try_put_and_wait_reserving(); + // Test standard key_matching policy + test_try_put_and_wait_key_matching([](int_wrapper w) { return w.i; }); + // Test msg based key_matching policy + test_try_put_and_wait_key_matching(); +} diff --git a/test/tbb/test_limiter_node.cpp b/test/tbb/test_limiter_node.cpp index 897f840d36..0bf4912f8a 100644 --- a/test/tbb/test_limiter_node.cpp +++ b/test/tbb/test_limiter_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -38,8 +38,8 @@ const int L = 10; const int N = 1000; -using tbb::detail::d1::SUCCESSFULLY_ENQUEUED; -using tbb::detail::d1::graph_task; +using tbb::detail::d2::SUCCESSFULLY_ENQUEUED; +using tbb::detail::d2::graph_task; template< typename T > struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { @@ -53,6 +53,12 @@ struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -71,6 +77,12 @@ struct parallel_receiver : public tbb::flow::receiver, utils::NoAssign { return const_cast(SUCCESSFULLY_ENQUEUED); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + graph_task * try_put_task( const T &v, const tbb::detail::d2::message_metainfo& ) override { + return try_put_task(v); + } +#endif + tbb::flow::graph& graph_reference() const override { return my_graph; } @@ -534,6 +546,67 @@ void test_decrement_while_try_put_task() { CHECK_MESSAGE(processed.load() == threshold, "decrementer terminate flow graph work"); } +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + std::size_t threshold = start_work_items.size() + 1; + CHECK_MESSAGE(new_work_items.size() < threshold, "Incorrect test setup"); + + tbb::flow::limiter_node limiter(g, threshold); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + for (auto item : new_work_items) { + 
limiter.try_put(item); + } + } + processed_items.emplace_back(input); + }); + + tbb::flow::make_edge(limiter, function); + tbb::flow::make_edge(function, limiter.decrementer()); + + for (auto item : start_work_items) { + limiter.try_put(item); + } + + limiter.try_put_and_wait(wait_message); + + // Since function is a serial queueing function_node, all start_work_items would be added to the queue + // and processed in FIFO order. wait_message would be added and processed last. Each item in start_work_items + // should put an item to a decrementer edge and hence new_work_items should not be missed as well + + std::size_t check_index = 0; + + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + CHECK_MESSAGE(check_index == processed_items.size(), "Unexpected number of messages"); + + g.wait_for_all(); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected new_work_items processing"); + } + CHECK(check_index == processed_items.size()); + }); +} +#endif //! Test puts on limiter_node with decrements and varying parallelism levels //! \brief \ref error_guessing @@ -623,3 +696,10 @@ TEST_CASE("Test correct node deallocation while using small_object_pool") { tbb::task_scheduler_handle handle{ tbb::attach{} }; tbb::finalize( handle, std::nothrow ); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test limiter_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_overwrite_node.cpp b/test/tbb/test_overwrite_node.cpp index 127cca2d15..3f5ed8fec0 100644 --- a/test/tbb/test_overwrite_node.cpp +++ b/test/tbb/test_overwrite_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include "common/graph_utils.h" #include "common/test_follows_and_precedes_api.h" +#include "test_buffering_try_put_and_wait.h" //! \file test_overwrite_node.cpp //! \brief Test for [flow_graph.overwrite_node] specification @@ -183,6 +184,165 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_overwrite_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + { + std::vector processed_items; + + // Returns the index from which wait_for_all processing started + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // It is expected that try_put_and_wait would process start_work_items (FIFO) and the wait_message + // and new_work_items (FIFO) would be processed in wait_for_all + + CHECK_MESSAGE(after_start - 1 == start_work_items.size() + 1, + "incorrect number of items processed by try_put_and_wait"); + std::size_t check_index = 0; + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected 
wait_message processing"); + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "unexpected new_work_items processing"); + } + } + // Test pull + { + tbb::task_arena arena(1); + + arena.execute([&] { + std::vector processed_items; + + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + int start_message = 0; + int new_message = 1; + + using function_node_type = tbb::flow::function_node; + + function_node_type function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + buffer.try_put(new_message); + } + + // Explicitly clean the buffer to prevent infinite try_get by the function_node + if (input == new_message) { + buffer.clear(); + } + + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(buffer, function); + + buffer.try_put(start_message); // Occupies concurrency of function + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 2, "only the start_message and wait_message should be processed"); + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items[check_index++] == start_message, "unexpected start_message processing"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, "unexpected new_message processing"); + CHECK(check_index == processed_items.size()); + }); + } + // Test reserve + { + tbb::task_arena arena(1); + + arena.execute([&] { + std::vector processed_items; + + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + tbb::flow::limiter_node limiter(g, 1); + int start_message = 0; + int new_message = 1; + + using function_node_type = tbb::flow::function_node; + + function_node_type function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + buffer.try_put(new_message); + } + + // Explicitly clean the buffer to prevent infinite try_get by the 
function_node + if (input == new_message) { + buffer.clear(); + } + + processed_items.emplace_back(input); + limiter.decrementer().try_put(1); + return 0; + }); + + tbb::flow::make_edge(buffer, limiter); + tbb::flow::make_edge(limiter, function); + + buffer.try_put(start_message); // Occupies concurrency of function + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 2, "only the start_message and wait_message should be processed"); + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items[check_index++] == start_message, "unexpected start_message processing"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "unexpected wait_message processing"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, "unexpected new_message processing"); + CHECK(check_index == processed_items.size()); + }); + } + // Test explicit clear + { + tbb::flow::graph g; + tbb::flow::overwrite_node buffer(g); + + std::vector processed_items; + + tbb::flow::function_node f(g, tbb::flow::serial, + [&](int input) { + processed_items.emplace_back(input); + buffer.clear(); + return 0; + }); + + tbb::flow::make_edge(buffer, f); + + buffer.try_put_and_wait(wait_message); + + CHECK_MESSAGE(processed_items.size() == 1, "Incorrect number of processed items"); + CHECK_MESSAGE(processed_items.back() == wait_message, "unexpected processing"); + + g.wait_for_all(); + + CHECK(processed_items.size() == 1); + CHECK(processed_items.back() == wait_message); + } +} +#endif + //! Test read-write properties //! \brief \ref requirement \ref error_guessing TEST_CASE("Read-write"){ @@ -256,3 +416,10 @@ TEST_CASE("Cancel register_predecessor_task") { // Wait for cancellation of spawned tasks g.wait_for_all(); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test overwrite_node try_put_and_wait") { + test_overwrite_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_priority_queue_node.cpp b/test/tbb/test_priority_queue_node.cpp index d14aa4bbb3..18a60eb935 100644 --- a/test/tbb/test_priority_queue_node.cpp +++ b/test/tbb/test_priority_queue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include "test_buffering_try_put_and_wait.h" //! \file test_priority_queue_node.cpp //! \brief Test for [flow_graph.priority_queue_node] specification @@ -378,6 +379,166 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_pqueue_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = -10; + + for (int i = 0; i < 10; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. 
wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer start_work_items into the function_node in LIFO order + // 4. wait_message would be transferred last because of lowest priority + // 5. the first item would occupy concurrency of function, other items would be pushed to the queue + // 6. function would process start_work_items and push them to the buffer2 + // 7. wait_message would be processed last and add new_work_items to buffer1 + // 8. forward_task on buffer2 would transfer start_work_items in FIFO order and the wait_message to the writer + // 9. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in LIFO order + // Expected items processing - { start_work_items LIFO, wait_message, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], + "try_put_and_wait should process start_work_items in LIFO order"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], + "wait_for_all should process new_work_items in LIFO order"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. 
push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in the priority (LIFO) order + // 6. wait_message would be taken last due to lowest priority + // 7. new_work_items would be pushed to the buffer while processing wait_message + // During wait_for_all() + // 8. 
new_work_items would be taken from the buffer in the priority (LIFO) order + // Expected items processing { occupier, start_work_items LIFO, wait_message, new_work_items LIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 2, + "try_put_and_wait should process start_work_items, occupier and the wait_message"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "try_put_and_wait should process the occupier"); + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[i - 1], + "try_put_and_wait should process start_work_items in LIFO order"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[i - 1], + "wait_for_all should process new_work_items in LIFO order"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push the first message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for the first message processing + // 5. the first would be processed + // 6. 
decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // in the priority (greatest first) order + // 7. When the wait_message would be taken from the queue, the try_put_and_wait would exit + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and wait_message"); + for (std::size_t index = start_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == start_work_items[index - 1], + "Unexpected start_work_items processing"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + + for (std::size_t index = new_work_items.size(); index != 0; --index) { + CHECK_MESSAGE(processed_items[check_index++] == new_work_items[index - 1], + "Unexpected new_work_items processing"); + } + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test serial, parallel behavior and reservation under parallelism //! \brief \ref requirement \ref error_guessing TEST_CASE("Serial, parallel and reservation tests"){ @@ -419,3 +580,9 @@ TEST_CASE("Test deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test priority_queue_node try_put_and_wait") { + test_pqueue_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_queue_node.cpp b/test/tbb/test_queue_node.cpp index e034ef6645..546b47edae 100644 --- a/test/tbb/test_queue_node.cpp +++ b/test/tbb/test_queue_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include +#include "test_buffering_try_put_and_wait.h" //! \file test_queue_node.cpp //! 
\brief Test for [flow_graph.queue_node] specification @@ -494,6 +495,162 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_queue_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in FIFO order + // 4. the first item would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process start_work_items and push them to the buffer2 + // 6. wait_message would be processed last and add new_work_items to buffer1 + // 7. forward_task on buffer2 would transfer start_work_items in FIFO order and the wait_message to the writer + // 8. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. 
forward_task for new_work_items in buffer1 would be spawned and put items in function in FIFO order + // 11. function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "wait_for_all should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. 
forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in FIFO order + // 6. wait_message would be taken last and push new_work_items to the buffer + // During wait_for_all() + // 7. new_work_items would be taken from the buffer in FIFO order + // Expected items processing { occupier, start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 2, + "start_work_items, occupier and wait_message should be processed by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message_would be pushed to the buffer + // 3. forward task of the buffer would push the first message to the limiter node. 
+ // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for the first message processing + // 5. the first would be processed + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // 7. When the wait_message would be taken from the queue, the try_put_and_wait would exit + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected start_work_items processing"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected wait_message processing"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, "Unexpected new_work_items processing"); + } + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test serial, parallel behavior and reservation under parallelism //! \brief \ref requirement \ref error_guessing TEST_CASE("Parallel, serial test"){ @@ -559,3 +716,10 @@ TEST_CASE("queue_node with reservation"){ CHECK_MESSAGE((out_arg == -1), "Getting from reserved node should not update its argument."); g.wait_for_all(); } + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test queue_node try_put_and_wait") { + test_queue_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_sequencer_node.cpp b/test/tbb/test_sequencer_node.cpp index 564721f682..1e6494d69b 100644 --- a/test/tbb/test_sequencer_node.cpp +++ b/test/tbb/test_sequencer_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include "test_buffering_try_put_and_wait.h" //! \file test_sequencer_node.cpp //! \brief Test for [flow_graph.sequencer_node] specification @@ -437,6 +438,169 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_seq_node_try_put_and_wait() { + using namespace test_try_put_and_wait; + + std::vector start_work_items; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + auto simple_sequencer = [](int item) { return item; }; + + // Test push + // test_buffer_push tests the graph + // buffer1 -> function -> buffer2 -> writer + // function is a queueing serial function_node that submits new_work_items once wait_message arrives + // writer is an unlimited function_node that writes an item into the processed_items vector + // Test steps + // 1. push start_work_items into the buffer1 + // 2. buffer1.try_put_and_wait(wait_message); + // 3. g.wait_for_all() + // test_buffer_push returns the index from which the items processed during wait_for_all() starts + { + std::vector processed_items; + + std::size_t after_start = test_buffer_push>(start_work_items, wait_message, + new_work_items, processed_items, + simple_sequencer); + + // Expected effect: + // During buffer1.try_put_and_wait() + // 1. 
start_work_items would be pushed to buffer1 + // 2. wait_message would be pushed to buffer1 + // 3. forward_task on buffer1 would transfer all of the items to the function_node in sequencer order (FIFO) + // 4. the first item would occupy concurrency of function, other items would be pushed to the queue + // 5. function would process start_work_items and push them to the buffer2 + // 6. wait_message would be processed last and add new_work_items to buffer1 + // 7. forward_task on buffer2 would transfer start_work_items in sequencer (FIFO) order and the wait_message to the writer + // 8. try_put_and_wait exits since wait_message is completed + // During g.wait_for_all() + // 10. forward_task for new_work_items in buffer1 would be spawned and put items in function in FIFO order + // 11. function_node would process and push forward items from the queue in FIFO order + // Expected items processing - { start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "try_put_and_wait should process start_work_items and the wait_message"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "try_put_and_wait should process wait_message after start_work_items"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "wait_for_all should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } // Test push + + // Test pull + // test_buffer_pull tests the graph + // buffer -> function + // function is a rejecting serial function_node that submits new_work_items once wait_message arrives + // and writes the processed item into the processed_items + // Test steps + // 1. push the occupier message to the function + // 2. 
push start_work_items into the buffer + // 3. buffer.try_put_and_wait(wait_message) + // 4. g.wait_for_all() + // test_buffer_pull returns the index from which the items processed during wait_for_all() starts + + { + std::vector processed_items; + int occupier = 42; + + std::size_t after_start = test_buffer_pull>(start_work_items, wait_message, occupier, + new_work_items, processed_items, + simple_sequencer); + + // Expected effect + // 0. task for occupier processing would be spawned by the function + // During buffer.try_put_and_wait() + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward_task would try to push items to the function, but would fail + // and set the edge to the pull state + // 4. occupier would be processed + // 5. items would be taken from the buffer by function in FIFO order + // 6. wait_message would be taken last and push new_work_items to the buffer + // During wait_for_all() + // 7. new_work_items would be taken from the buffer in FIFO order + // Expected items processing { occupier, start_work_items FIFO, wait_message, new_work_items FIFO } + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 2, + "start_work_items, occupier and wait_message should be processed by try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupier, "Unexpected items processing by try_put_and_wait"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } + + // Test reserve + { + int thresholds[] = { 1, 2 }; + + for (int 
threshold : thresholds) { + std::vector processed_items; + + // test_buffer_reserve tests the following graph + // buffer -> limiter -> function + // function is a rejecting serial function_node that puts an item to the decrementer port + // of the limiter inside of the body + + std::size_t after_start = test_buffer_reserve>(threshold, + start_work_items, wait_message, new_work_items, processed_items, simple_sequencer); + + // Expected effect: + // 1. start_work_items would be pushed to the buffer + // 2. wait_message would be pushed to the buffer + // 3. forward task of the buffer would push the first message to the limiter node. + // Since the limiter threshold is not reached, it would be directly passed to the function + // 4. function would spawn the task for the first message processing + // 5. the first would be processed + // 6. decrementer.try_put() would be called and the limiter node would + // process all of the items from the buffer using the try_reserve/try_consume/try_release semantics + // 7. When the wait_message would be taken from the buffer, the try_put_and_wait would exit + + std::size_t check_index = 0; + + CHECK_MESSAGE(after_start == start_work_items.size() + 1, + "start_work_items and wait_message should be processed by try_put_and_wait"); + for (auto item : start_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process start_work_items FIFO"); + } + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Unexpected items processing by try_put_and_wait"); + + for (auto item : new_work_items) { + CHECK_MESSAGE(processed_items[check_index++] == item, + "try_put_and_wait should process new_work_items FIFO"); + } + CHECK(check_index == processed_items.size()); + } + } +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test sequencer with various request orders and parallelism levels //! 
\brief \ref requirement \ref error_guessing TEST_CASE("Serial and parallel test"){ @@ -501,3 +665,10 @@ TEST_CASE("constraints for sequencer_node sequencer") { static_assert(!can_call_sequencer_node_ctor>); } #endif // __TBB_CPP20_CONCEPTS_PRESENT + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test sequencer_node try_put_and_wait") { + test_seq_node_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_split_node.cpp b/test/tbb/test_split_node.cpp index e791b546b5..1e03be0dab 100644 --- a/test/tbb/test_split_node.cpp +++ b/test/tbb/test_split_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -397,6 +397,83 @@ void test_deduction_guides() { #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + tbb::task_arena arena(1); + + arena.execute([] { + tbb::flow::graph g; + + std::vector start_work_items; + std::vector processed_items1; + std::vector processed_items2; + std::vector new_work_items; + int wait_message = 10; + + for (int i = 0; i < wait_message; ++i) { + start_work_items.emplace_back(i); + new_work_items.emplace_back(i + 1 + wait_message); + } + + using tuple_type = std::tuple; + tbb::flow::split_node split(g); + + tbb::flow::function_node function1(g, tbb::flow::unlimited, + [&](int input) noexcept { + if (input == wait_message) { + for (int item : new_work_items) { + split.try_put(tuple_type{item, item}); + } + } + processed_items1.emplace_back(input); + return 0; + }); + + tbb::flow::function_node function2(g, tbb::flow::unlimited, + [&](int input) noexcept { + processed_items2.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(tbb::flow::output_port<0>(split), function1); + tbb::flow::make_edge(tbb::flow::output_port<1>(split), function2); + + for (int i = 0; i < 
wait_message; ++i) { + split.try_put(tuple_type{i, i}); + } + + split.try_put_and_wait(tuple_type{wait_message, wait_message}); + + std::size_t check_index1 = 0; + std::size_t check_index2 = 0; + + // Since split node broadcasts items to successors from last to first, start_work_items tasks and wait_message would be spawned + // in the following order {f2 - 1} - {f1 - 1} {f2 - 2} {f1 - 2} ... {f2 - 10}{f1 - 10} + // and processed in reversed order + // Hence {f1 - wait_message} task would be processed first and it would spawn tasks for new_work_items in the same order + // Since new_work_items tasks would be processed first and {f2 - 10} would be still in queue + // it is expected that during the try_put_and_wait {f1 - 10} would be processed first, then new_work_items would be processed + // and only when {f2 - 10} would be taken and executed, try_put_and_wait would be exited + // All of the other tasks for start_work_items would be processed during wait_for_all() + CHECK_MESSAGE(processed_items1[check_index1++] == wait_message, "Unexpected items processing"); + + for (std::size_t i = new_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items1[check_index1++] == new_work_items[i - 1], "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == new_work_items[i - 1], "Unexpected items processing"); + } + + CHECK_MESSAGE(processed_items2[check_index2++] == wait_message, "Unexpected items processing"); + + g.wait_for_all(); + + for (std::size_t i = start_work_items.size(); i != 0; --i) { + CHECK_MESSAGE(processed_items1[check_index1++] == start_work_items[i - 1], "Unexpected items processing"); + CHECK_MESSAGE(processed_items2[check_index2++] == start_work_items[i - 1], "Unexpected items processing"); + } + }); +} +#endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT + //! Test output ports and message passing with different input tuples //! 
\brief \ref requirement \ref error_guessing TEST_CASE("Tuple tests"){ @@ -446,3 +523,9 @@ TEST_CASE("Deduction guides"){ } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! \brief \ref error_guessing +TEST_CASE("test split_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbb/test_tagged_msg.cpp b/test/tbb/test_tagged_msg.cpp index 656f0d3e89..520ecda9c2 100644 --- a/test/tbb/test_tagged_msg.cpp +++ b/test/tbb/test_tagged_msg.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ typedef tbb::flow::tagged_msg wi(42); Wrapper wic(23); diff --git a/test/tbb/test_task.cpp b/test/tbb/test_task.cpp index 927c841033..6c2060a69a 100644 --- a/test/tbb/test_task.cpp +++ b/test/tbb/test_task.cpp @@ -861,3 +861,44 @@ TEST_CASE("Try to force Leaked proxy observers warning") { }); }); } + +//! 
\brief \ref error_guessing +TEST_CASE("Force thread limit on per-thread reference_vertex") { + int num_threads = std::thread::hardware_concurrency(); + int num_groups = 1000; + + // Force thread limit on per-thread reference_vertex + std::vector groups(num_groups); + tbb::parallel_for(0, num_threads, [&] (int) { + std::vector local_groups(num_groups); + for (int i = 0; i < num_groups; ++i) { + groups[i].run([] {}); + local_groups[i].run([] {}); + local_groups[i].wait(); + } + }, tbb::static_partitioner{}); + + // Enforce extra reference on each task_group + std::deque handles{}; + for (int i = 0; i < num_groups; ++i) { + handles.emplace_back(groups[i].defer([] {})); + } + + // Check correctness of the execution + tbb::task_group group; + + std::atomic final_sum{}; + for (int i = 0; i < num_groups; ++i) { + group.run([&] { ++final_sum; }); + } + group.wait(); + REQUIRE_MESSAGE(final_sum == num_groups, "Some tasks were not executed"); + + for (int i = 0; i < num_groups; ++i) { + groups[i].run(std::move(handles[i])); + } + + for (int i = 0; i < num_groups; ++i) { + groups[i].wait(); + } +} diff --git a/test/tbb/test_tbb_header.cpp b/test/tbb/test_tbb_header.cpp index a671de1c2d..122af5ec08 100644 --- a/test/tbb/test_tbb_header.cpp +++ b/test/tbb/test_tbb_header.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2023 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -165,13 +165,13 @@ static void TestExceptionClassesExports () { TestExceptionClassExports( std::out_of_range("test"), tbb::detail::exception_id::invalid_key ); TestExceptionClassExports( tbb::user_abort(), tbb::detail::exception_id::user_abort ); TestExceptionClassExports( std::runtime_error("test"), tbb::detail::exception_id::bad_tagged_msg_cast ); + TestExceptionClassExports( tbb::unsafe_wait("test"), tbb::detail::exception_id::unsafe_wait ); } #if __TBB_CPF_BUILD // These names are only tested in "preview" configuration // When a feature becomes fully supported, its names should be moved to the main test static void TestPreviewNames() { - TestTypeDefinitionPresence2( blocked_rangeNd ); TestTypeDefinitionPresence2( concurrent_lru_cache ); TestTypeDefinitionPresence( isolated_task_group ); } @@ -237,6 +237,7 @@ static void DefinitionPresence() { TestTypeDefinitionPresence( blocked_range ); TestTypeDefinitionPresence( blocked_range2d ); TestTypeDefinitionPresence( blocked_range3d ); + TestTypeDefinitionPresence2( blocked_nd_range ); TestTypeDefinitionPresence( collaborative_once_flag ); TestFuncDefinitionPresence( collaborative_call_once, (tbb::collaborative_once_flag&, const Body&), void ); TestFuncDefinitionPresence( parallel_invoke, (const Body&, const Body&, const Body&), void ); diff --git a/test/tbb/test_write_once_node.cpp b/test/tbb/test_write_once_node.cpp index 2bb16383f8..6fb716bab0 100644 --- a/test/tbb/test_write_once_node.cpp +++ b/test/tbb/test_write_once_node.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2021 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -207,6 +207,135 @@ void test_deduction_guides() { } #endif +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +void test_try_put_and_wait() { + int wait_message = 0; + int occupy_concurrency_message = 1; + int new_message = 2; + + // Test push + { + tbb::task_arena arena(1); + + std::vector processed_items; + + arena.execute([&] { + tbb::flow::graph g; + + tbb::flow::write_once_node wo_buffer(g); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == wait_message) { + wo_buffer.clear(); + wo_buffer.try_put(new_message); + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(wo_buffer, function); + + wo_buffer.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items.size() == 1, "Only the wait_message should be processed"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, "Only the wait_message should be processed"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, + "only the new_message should be processed in wait_for_all"); + CHECK(check_index == processed_items.size()); + }); + } + // Test pull + { + std::vector processed_items; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + tbb::flow::write_once_node wo_buffer(g); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == new_message || input == wait_message) { + wo_buffer.clear(); + } + + if (input == wait_message) { + wo_buffer.try_put(new_message); + } + processed_items.emplace_back(input); + return 0; + }); + + tbb::flow::make_edge(wo_buffer, function); + + function.try_put(occupy_concurrency_message); + wo_buffer.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items.size() == 2, "unexpected message processing for try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupy_concurrency_message, + "occupy_concurrency_message 
should be processed first"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "wait_message was not processed"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, + "only the new_message should be processed in wait_for_all"); + CHECK(check_index == processed_items.size()); + }); + } + // Test reserve + { + std::vector processed_items; + tbb::task_arena arena(1); + + arena.execute([&] { + tbb::flow::graph g; + + tbb::flow::write_once_node wo_buffer(g); + tbb::flow::limiter_node limiter(g, 1); + tbb::flow::function_node function(g, tbb::flow::serial, + [&](int input) { + if (input == new_message || input == wait_message) { + wo_buffer.clear(); + } + + if (input == wait_message) { + wo_buffer.try_put(new_message); + } + processed_items.emplace_back(input); + limiter.decrementer().try_put(1); + return 0; + }); + + tbb::flow::make_edge(wo_buffer, limiter); + tbb::flow::make_edge(limiter, function); + + limiter.try_put(occupy_concurrency_message); + wo_buffer.try_put_and_wait(wait_message); + + std::size_t check_index = 0; + CHECK_MESSAGE(processed_items.size() == 2, "unexpected message processing for try_put_and_wait"); + CHECK_MESSAGE(processed_items[check_index++] == occupy_concurrency_message, + "occupy_concurrency_message should be processed first"); + CHECK_MESSAGE(processed_items[check_index++] == wait_message, + "wait_message was not processed"); + + g.wait_for_all(); + + CHECK_MESSAGE(processed_items[check_index++] == new_message, + "only the new_message should be processed in wait_for_all"); + CHECK(check_index == processed_items.size()); + }); + } +} +#endif + //! Test read-write properties //! \brief \ref requirement \ref error_guessing TEST_CASE("Read-write tests"){ @@ -244,3 +373,10 @@ TEST_CASE("Deduction guides"){ test_deduction_guides(); } #endif + +#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT +//! 
\brief \ref error_guessing +TEST_CASE("test write_once_node try_put_and_wait") { + test_try_put_and_wait(); +} +#endif diff --git a/test/tbbmalloc/test_malloc_atexit.cpp b/test/tbbmalloc/test_malloc_atexit.cpp index 3442233044..19ccc4d640 100644 --- a/test/tbbmalloc/test_malloc_atexit.cpp +++ b/test/tbbmalloc/test_malloc_atexit.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,6 +33,8 @@ #if _USRDLL #if _WIN32||_WIN64 extern __declspec(dllexport) +#else +__TBB_EXPORT #endif bool dll_isMallocOverloaded() #else diff --git a/test/tbbmalloc/test_malloc_lib_unload.cpp b/test/tbbmalloc/test_malloc_lib_unload.cpp index b2e515423d..441a221d0c 100644 --- a/test/tbbmalloc/test_malloc_lib_unload.cpp +++ b/test/tbbmalloc/test_malloc_lib_unload.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,24 +23,28 @@ const char *globalCallMsg = "A TBB allocator function call is resolved into wrong implementation."; #if _WIN32||_WIN64 + #define TEST_EXPORT __declspec(dllexport) +#else + #define TEST_EXPORT __TBB_EXPORT +#endif + // must be defined in DLL for linker to not drop the dependency on the DLL. 
extern "C" { - extern __declspec(dllexport) void *scalable_malloc(size_t); - extern __declspec(dllexport) void scalable_free (void *); - extern __declspec(dllexport) void safer_scalable_free (void *, void (*)(void*)); - extern __declspec(dllexport) void *scalable_realloc(void *, size_t); - extern __declspec(dllexport) void *safer_scalable_realloc(void *, size_t, void *); - extern __declspec(dllexport) void *scalable_calloc(size_t, size_t); - extern __declspec(dllexport) int scalable_posix_memalign(void **, size_t, size_t); - extern __declspec(dllexport) void *scalable_aligned_malloc(size_t, size_t); - extern __declspec(dllexport) void *scalable_aligned_realloc(void *, size_t, size_t); - extern __declspec(dllexport) void *safer_scalable_aligned_realloc(void *, size_t, size_t, void *); - extern __declspec(dllexport) void scalable_aligned_free(void *); - extern __declspec(dllexport) size_t scalable_msize(void *); - extern __declspec(dllexport) size_t safer_scalable_msize (void *, size_t (*)(void*)); - extern __declspec(dllexport) int anchor(); + extern TEST_EXPORT void *scalable_malloc(size_t); + extern TEST_EXPORT void scalable_free (void *); + extern TEST_EXPORT void safer_scalable_free (void *, void (*)(void*)); + extern TEST_EXPORT void *scalable_realloc(void *, size_t); + extern TEST_EXPORT void *safer_scalable_realloc(void *, size_t, void *); + extern TEST_EXPORT void *scalable_calloc(size_t, size_t); + extern TEST_EXPORT int scalable_posix_memalign(void **, size_t, size_t); + extern TEST_EXPORT void *scalable_aligned_malloc(size_t, size_t); + extern TEST_EXPORT void *scalable_aligned_realloc(void *, size_t, size_t); + extern TEST_EXPORT void *safer_scalable_aligned_realloc(void *, size_t, size_t, void *); + extern TEST_EXPORT void scalable_aligned_free(void *); + extern TEST_EXPORT size_t scalable_msize(void *); + extern TEST_EXPORT size_t safer_scalable_msize (void *, size_t (*)(void*)); + extern TEST_EXPORT int anchor(); } -#endif extern "C" int anchor() { 
return 42; diff --git a/test/tbbmalloc/test_malloc_used_by_lib.cpp b/test/tbbmalloc/test_malloc_used_by_lib.cpp index 43c76806ed..dccddd7e28 100644 --- a/test/tbbmalloc/test_malloc_used_by_lib.cpp +++ b/test/tbbmalloc/test_malloc_used_by_lib.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2005-2022 Intel Corporation + Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,11 +23,13 @@ #include "common/utils_assert.h" #include "tbb/scalable_allocator.h" -#if _WIN32||_WIN64 extern "C" { +#if _WIN32||_WIN64 extern __declspec(dllexport) void callDll(); -} +#else + extern __TBB_EXPORT void callDll(); #endif +} extern "C" void callDll() {